From 222678030bb721264fcee2fb992227a092b7e131 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 27 Jul 2015 01:12:25 +0200 Subject: [PATCH] DOC: improve docs on iteration --- doc/source/basics.rst | 144 ++++++++++++++++++++++++++++++++---------- pandas/core/frame.py | 78 +++++++++++++++++++---- 2 files changed, 174 insertions(+), 48 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index aae931a4b8319..d415db88b9cb6 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1151,24 +1151,81 @@ parameter that is by default ``False`` and copies the underlying data. Pass The Panel class has a related :meth:`~Panel.rename_axis` class which can rename any of its three axes. +.. _basics.iteration: + Iteration --------- -Because Series is array-like, basic iteration produces the values. Other data -structures follow the dict-like convention of iterating over the "keys" of the -objects. In short: +The behavior of basic iteration over pandas objects depends on the type. +When iterating over a Series, it is regarded as array-like, and basic iteration +produces the values. Other data structures, like DataFrame and Panel, +follow the dict-like convention of iterating over the "keys" of the +objects. + +In short, basic iteration (``for i in object``) produces: - * **Series**: values - * **DataFrame**: column labels - * **Panel**: item labels +* **Series**: values +* **DataFrame**: column labels +* **Panel**: item labels -Thus, for example: +Thus, for example, iterating over a DataFrame gives you the column names: .. ipython:: - In [0]: for col in df: - ...: print(col) - ...: + In [0]: df = pd.DataFrame({'col1' : np.random.randn(3), 'col2' : np.random.randn(3)}, + ...: index=['a', 'b', 'c']) + + In [0]: for col in df: + ...: print(col) + ...: + +Pandas objects also have the dict-like :meth:`~DataFrame.iteritems` method to +iterate over the (key, value) pairs. 
+ +To iterate over the rows of a DataFrame, you can use the following methods: + +* :meth:`~DataFrame.iterrows`: Iterate over the rows of a DataFrame as (index, Series) pairs. + This converts the rows to Series objects, which can change the dtypes and has some + performance implications. +* :meth:`~DataFrame.itertuples`: Iterate over the rows of a DataFrame as tuples of the values. + This is a lot faster than :meth:`~DataFrame.iterrows`, and is in most cases preferable to + use to iterate over the values of a DataFrame. + +.. warning:: + + Iterating through pandas objects is generally **slow**. In many cases, + iterating manually over the rows is not needed and can be avoided with + one of the following approaches: + + * Look for a *vectorized* solution: many operations can be performed using + built-in methods or numpy functions, (boolean) indexing, ... + + * When you have a function that cannot work on the full DataFrame/Series + at once, it is better to use :meth:`~DataFrame.apply` instead of iterating + over the values. See the docs on :ref:`function application <basics.apply>`. + + * If you need to do iterative manipulations on the values but performance is + important, consider writing the inner loop using e.g. cython or numba. + See the :ref:`enhancing performance <enhancingperf>` section for some + examples of this approach. + +.. warning:: + + You should **never modify** something you are iterating over. + This is not guaranteed to work in all cases. Depending on the + data types, the iterator returns a copy and not a view, and writing + to it will have no effect! + + For example, in the following case setting the value has no effect: + + .. 
ipython:: python + + df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']}) + + for index, row in df.iterrows(): + row['a'] = 10 + + df iteritems ~~~~~~~~~ @@ -1176,9 +1233,9 @@ iteritems Consistent with the dict-like interface, :meth:`~DataFrame.iteritems` iterates through key-value pairs: - * **Series**: (index, scalar value) pairs - * **DataFrame**: (column, Series) pairs - * **Panel**: (item, DataFrame) pairs +* **Series**: (index, scalar value) pairs +* **DataFrame**: (column, Series) pairs +* **Panel**: (item, DataFrame) pairs For example: @@ -1189,22 +1246,46 @@ For example: ...: print(frame) ...: - .. _basics.iterrows: iterrows ~~~~~~~~ -New in v0.7 is the ability to iterate efficiently through rows of a -DataFrame with :meth:`~DataFrame.iterrows`. It returns an iterator yielding each +:meth:`~DataFrame.iterrows` allows you to iterate through the rows of a +DataFrame as Series objects. It returns an iterator yielding each index value along with a Series containing the data in each row: .. ipython:: - In [0]: for row_index, row in df2.iterrows(): + In [0]: for row_index, row in df.iterrows(): ...: print('%s\n%s' % (row_index, row)) ...: +.. note:: + + Because :meth:`~DataFrame.iterrows` returns a Series for each row, + it does **not** preserve dtypes across the rows (dtypes are + preserved across columns for DataFrames). For example, + + .. ipython:: python + + df_orig = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) + df_orig.dtypes + row = next(df_orig.iterrows())[1] + row + + All values in ``row``, returned as a Series, are now upcast + to floats, including the original integer value in column ``int``: + + .. ipython:: python + + row['int'].dtype + df_orig['int'].dtype + + To preserve dtypes while iterating over the rows, it is better + to use :meth:`~DataFrame.itertuples` which returns tuples of the values + and which is generally much faster than ``iterrows``. + For instance, a contrived way to transpose the DataFrame would be: .. 
ipython:: python @@ -1216,36 +1297,29 @@ For instance, a contrived way to transpose the DataFrame would be: df2_t = pd.DataFrame(dict((idx,values) for idx, values in df2.iterrows())) print(df2_t) -.. note:: - - ``iterrows`` does **not** preserve dtypes across the rows (dtypes are - preserved across columns for DataFrames). For example, - - .. ipython:: python - - df_iter = pd.DataFrame([[1, 1.0]], columns=['x', 'y']) - row = next(df_iter.iterrows())[1] - print(row['x'].dtype) - print(df_iter['x'].dtype) - itertuples ~~~~~~~~~~ -The :meth:`~DataFrame.itertuples` method will return an iterator yielding a tuple for each row in the -DataFrame. The first element of the tuple will be the row's corresponding index -value, while the remaining values are the row values proper. +The :meth:`~DataFrame.itertuples` method will return an iterator +yielding a tuple for each row in the DataFrame. The first element +of the tuple will be the row's corresponding index value, +while the remaining values are the row values. For instance, .. ipython:: python - for r in df2.itertuples(): - print(r) + for row in df.itertuples(): + print(row) + +This method does not convert the row to a Series object but just returns the +values inside a tuple. Therefore, :meth:`~DataFrame.itertuples` preserves the +data type of the values and is generally faster than :meth:`~DataFrame.iterrows`. .. _basics.dt_accessors: .dt accessor -~~~~~~~~~~~~ +------------ ``Series`` has an accessor to succinctly return datetime like properties for the *values* of the Series, if its a datetime/period like Series. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d52a859086aae..f6ea00b3714af 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -547,7 +547,15 @@ def _repr_html_(self): return None def iteritems(self): - """Iterator over (column, series) pairs""" + """ + Iterator over (column name, Series) pairs. 
+ + See also + -------- + iterrows : Iterate over the rows of a DataFrame as (index, Series) pairs. + itertuples : Iterate over the rows of a DataFrame as tuples of the values. + + """ if self.columns.is_unique and hasattr(self, '_item_cache'): for k in self.columns: yield k, self._get_item_cache(k) @@ -557,25 +565,45 @@ def iteritems(self): def iterrows(self): """ - Iterate over rows of DataFrame as (index, Series) pairs. + Iterate over the rows of a DataFrame as (index, Series) pairs. Notes ----- - * ``iterrows`` does **not** preserve dtypes across the rows (dtypes - are preserved across columns for DataFrames). For example, - - >>> df = DataFrame([[1, 1.0]], columns=['x', 'y']) - >>> row = next(df.iterrows())[1] - >>> print(row['x'].dtype) - float64 - >>> print(df['x'].dtype) - int64 + 1. Because ``iterrows`` returns a Series for each row, + it does **not** preserve dtypes across the rows (dtypes are + preserved across columns for DataFrames). For example, + + >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) + >>> row = next(df.iterrows())[1] + >>> row + int 1.0 + float 1.5 + Name: 0, dtype: float64 + >>> print(row['int'].dtype) + float64 + >>> print(df['int'].dtype) + int64 + + To preserve dtypes while iterating over the rows, it is better + to use :meth:`itertuples` which returns tuples of the values + and which is generally faster than ``iterrows``. + + 2. You should **never modify** something you are iterating over. + This is not guaranteed to work in all cases. Depending on the + data types, the iterator returns a copy and not a view, and writing + to it will have no effect. Returns ------- it : generator A generator that iterates over the rows of the frame. + + See also + -------- + itertuples : Iterate over the rows of a DataFrame as tuples of the values. + iteritems : Iterate over (column name, Series) pairs. 
+ """ columns = self.columns for k, v in zip(self.index, self.values): @@ -584,8 +612,32 @@ def iterrows(self): def itertuples(self, index=True): """ - Iterate over rows of DataFrame as tuples, with index value - as first element of the tuple + Iterate over the rows of DataFrame as tuples, with index value + as first element of the tuple. + + Parameters + ---------- + index : boolean, default True + If True, return the index as the first element of the tuple. + + See also + -------- + iterrows : Iterate over the rows of a DataFrame as (index, Series) pairs. + iteritems : Iterate over (column name, Series) pairs. + + Examples + -------- + + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [0.1, 0.2]}, index=['a', 'b']) + >>> df + col1 col2 + a 1 0.1 + b 2 0.2 + >>> for row in df.itertuples(): + ... print(row) + ('a', 1, 0.10000000000000001) + ('b', 2, 0.20000000000000001) + """ arrays = [] if index: