From 558a594061b70581b618789c8b8097904e578885 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 8 Sep 2013 23:00:09 -0400 Subject: [PATCH] ENH: Add axis and level keywords to where, so that the other argument can now be an alignable pandas object. --- doc/source/indexing.rst | 32 +++++++++++++++----- doc/source/missing_data.rst | 27 +++++++++++++++++ doc/source/release.rst | 2 ++ pandas/core/generic.py | 19 ++++++++---- pandas/core/internals.py | 59 +++++++++++++++++++++++++++++-------- pandas/core/series.py | 3 +- pandas/tests/test_frame.py | 29 ++++++++++++++++++ 7 files changed, 144 insertions(+), 27 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index e3a069960ab6b..d2fd11ee43615 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -625,6 +625,18 @@ This can be done intuitively like so: df2[df2 < 0] = 0 df2 +By default, ``where`` returns a modified copy of the data. There is an +optional parameter ``inplace`` so that the original data can be modified +without creating a copy: + +.. ipython:: python + + df_orig = df.copy() + df_orig.where(df > 0, -df, inplace=True); + df_orig + +**alignment** + Furthermore, ``where`` aligns the input boolean condition (ndarray or DataFrame), such that partial selection with setting is possible. This is analagous to partial setting via ``.ix`` (but on the contents rather than the axis labels) @@ -635,24 +647,30 @@ partial setting via ``.ix`` (but on the contents rather than the axis labels) df2[ df2[1:4] > 0 ] = 3 df2 -By default, ``where`` returns a modified copy of the data. There is an -optional parameter ``inplace`` so that the original data can be modified -without creating a copy: +.. versionadded:: 0.13 + +Where can also accept ``axis`` and ``level`` parameters to align the input when +performing the ``where``. .. 
ipython:: python - df_orig = df.copy() + df2 = df.copy() + df2.where(df2>0,df2['A'],axis='index') - df_orig.where(df > 0, -df, inplace=True); +This is equivalent to (but faster than) the following. - df_orig +.. ipython:: python + + df2 = df.copy() + df.apply(lambda x, y: x.where(x>0,y), y=df['A']) + +**mask** ``mask`` is the inverse boolean operation of ``where``. .. ipython:: python s.mask(s >= 0) - df.mask(df >= 0) Take Methods diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 0c8efb4e905ec..6b63032a6c659 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -205,6 +205,33 @@ To remind you, these are the available filling methods: With time series data, using pad/ffill is extremely common so that the "last known value" is available at every time point. +Filling with a PandasObject +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.12 + +You can also fill using a direct assignment with an alignable object. The +use case of this is to fill a DataFrame with the mean of that column. + +.. ipython:: python + + df = DataFrame(np.random.randn(10,3)) + df.iloc[3:5,0] = np.nan + df.iloc[4:6,1] = np.nan + df.iloc[5:8,2] = np.nan + df + + df.fillna(df.mean()) + +.. versionadded:: 0.13 + +This gives the same result as above, but aligns the 'fill' value, which is +a Series in this case. + +.. ipython:: python + + df.where(pd.notnull(df),df.mean(),axis='columns') + .. _missing_data.dropna: Dropping axis labels with missing data: dropna diff --git a/doc/source/release.rst b/doc/source/release.rst index f32ea44ed6242..70c520b6831bc 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -102,6 +102,8 @@ Improvements to existing features tests/test_frame, tests/test_multilevel (:issue:`4732`). 
- Performance improvement of timesesies plotting with PeriodIndex and added test to vbench (:issue:`4705` and :issue:`4722`) + - Add ``axis`` and ``level`` keywords to ``where``, so that the ``other`` argument + can now be an alignable pandas object. API Changes ~~~~~~~~~~~ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f4c5eb808689c..2919790300bc3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2173,6 +2173,8 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, from pandas import DataFrame, Series method = com._clean_fill_method(method) + if axis is not None: + axis = self._get_axis_number(axis) if isinstance(other, DataFrame): return self._align_frame(other, join=join, axis=axis, level=level, copy=copy, fill_value=fill_value, @@ -2262,7 +2264,8 @@ def _align_series(self, other, join='outer', axis=None, level=None, else: return left_result, right_result - def where(self, cond, other=np.nan, inplace=False, try_cast=False, raise_on_error=True): + def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, + try_cast=False, raise_on_error=True): """ Return an object of same shape as self and whose corresponding entries are from self where cond is True and otherwise are from other. 
@@ -2273,6 +2276,8 @@ def where(self, cond, other=np.nan, inplace=False, try_cast=False, raise_on_erro other : scalar or DataFrame inplace : boolean, default False Whether to perform the operation in place on the data + axis : alignment axis if needed, default None + level : alignment level if needed, default None try_cast : boolean, default False try to cast the result back to the input type (if possible), raise_on_error : boolean, default True @@ -2306,15 +2311,17 @@ def where(self, cond, other=np.nan, inplace=False, try_cast=False, raise_on_erro # align with me if other.ndim <= self.ndim: - _, other = self.align(other, join='left', fill_value=np.nan) + _, other = self.align(other, join='left', + axis=axis, level=level, + fill_value=np.nan) # if we are NOT aligned, raise as we cannot where index - if not all([ other._get_axis(i).equals(ax) for i, ax in enumerate(self.axes) ]): + if axis is None and not all([ other._get_axis(i).equals(ax) for i, ax in enumerate(self.axes) ]): raise InvalidIndexError # slice me out of the other else: - raise NotImplemented + raise NotImplemented("cannot align with a bigger dimensional PandasObject") elif is_list_like(other): @@ -2386,11 +2393,11 @@ def where(self, cond, other=np.nan, inplace=False, try_cast=False, raise_on_erro if inplace: # we may have different type blocks come out of putmask, so # reconstruct the block manager - self._data = self._data.putmask(cond, other, inplace=True) + self._data = self._data.putmask(cond, other, align=axis is None, inplace=True) else: new_data = self._data.where( - other, cond, raise_on_error=raise_on_error, try_cast=try_cast) + other, cond, align=axis is None, raise_on_error=raise_on_error, try_cast=try_cast) return self._constructor(new_data) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1716980813cea..91be4f42c17e4 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -593,22 +593,40 @@ def setitem(self, indexer, value): return [ self ] - def 
putmask(self, mask, new, inplace=False): + def putmask(self, mask, new, align=True, inplace=False): """ putmask the data to the block; it is possible that we may create a new dtype of block - return the resulting block(s) """ + return the resulting block(s) + + Parameters + ---------- + mask : the condition to respect + new : a ndarray/object + align : boolean, perform alignment on other/cond, default is True + inplace : perform inplace modification, default is False + + Returns + ------- + a new block(s), the result of the putmask + """ new_values = self.values if inplace else self.values.copy() # may need to align the new if hasattr(new, 'reindex_axis'): - axis = getattr(new, '_info_axis_number', 0) - new = new.reindex_axis(self.items, axis=axis, copy=False).values.T + if align: + axis = getattr(new, '_info_axis_number', 0) + new = new.reindex_axis(self.items, axis=axis, copy=False).values.T + else: + new = new.values.T # may need to align the mask if hasattr(mask, 'reindex_axis'): - axis = getattr(mask, '_info_axis_number', 0) - mask = mask.reindex_axis( - self.items, axis=axis, copy=False).values.T + if align: + axis = getattr(mask, '_info_axis_number', 0) + mask = mask.reindex_axis( + self.items, axis=axis, copy=False).values.T + else: + mask = mask.values.T # if we are passed a scalar None, convert it here if not is_list_like(new) and isnull(new): @@ -616,6 +634,11 @@ def putmask(self, mask, new, inplace=False): if self._can_hold_element(new): new = self._try_cast(new) + + # pseudo-broadcast + if isinstance(new,np.ndarray) and new.ndim == self.ndim-1: + new = np.repeat(new,self.shape[-1]).reshape(self.shape) + np.putmask(new_values, mask, new) # maybe upcast me @@ -842,7 +865,7 @@ def handle_error(): return [make_block(result, self.items, self.ref_items, ndim=self.ndim, fastpath=True)] - def where(self, other, cond, raise_on_error=True, try_cast=False): + def where(self, other, cond, align=True, raise_on_error=True, try_cast=False): """ evaluate the block; 
return result block(s) from the result @@ -850,6 +873,7 @@ def where(self, other, cond, raise_on_error=True, try_cast=False): ---------- other : a ndarray/object cond : the condition to respect + align : boolean, perform alignment on other/cond raise_on_error : if True, raise when I can't perform the function, False by default (and just return the data that we had coming in) @@ -862,21 +886,30 @@ def where(self, other, cond, raise_on_error=True, try_cast=False): # see if we can align other if hasattr(other, 'reindex_axis'): - axis = getattr(other, '_info_axis_number', 0) - other = other.reindex_axis(self.items, axis=axis, copy=True).values + if align: + axis = getattr(other, '_info_axis_number', 0) + other = other.reindex_axis(self.items, axis=axis, copy=True).values + else: + other = other.values # make sure that we can broadcast is_transposed = False if hasattr(other, 'ndim') and hasattr(values, 'ndim'): if values.ndim != other.ndim or values.shape == other.shape[::-1]: - values = values.T - is_transposed = True + + # pseodo broadcast (its a 2d vs 1d say and where needs it in a specific direction) + if other.ndim >= 1 and values.ndim-1 == other.ndim and values.shape[0] != other.shape[0]: + other = _block_shape(other).T + else: + values = values.T + is_transposed = True # see if we can align cond if not hasattr(cond, 'shape'): raise ValueError( "where must have a condition that is ndarray like") - if hasattr(cond, 'reindex_axis'): + + if align and hasattr(cond, 'reindex_axis'): axis = getattr(cond, '_info_axis_number', 0) cond = cond.reindex_axis(self.items, axis=axis, copy=True).values else: diff --git a/pandas/core/series.py b/pandas/core/series.py index 4f67fb1afdd5f..ef8c630a7bde8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2725,7 +2725,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): else: return self._constructor(mapped, index=self.index, name=self.name) - def align(self, other, join='outer', level=None, copy=True, + 
def align(self, other, join='outer', axis=None, level=None, copy=True, fill_value=None, method=None, limit=None): """ Align two Series object with the specified join method @@ -2734,6 +2734,7 @@ def align(self, other, join='outer', level=None, copy=True, ---------- other : Series join : {'outer', 'inner', 'left', 'right'}, default 'outer' + axis : None, alignment axis (is 0 for Series) level : int or name Broadcast across a level, matching Index values on the passed MultiIndex level diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index cefe15952d329..f9756858b5d85 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -7931,6 +7931,35 @@ def test_where_none(self): expected = DataFrame({'series': Series([0,1,2,3,4,5,6,7,np.nan,np.nan]) }) assert_frame_equal(df, expected) + def test_where_align(self): + + def create(): + df = DataFrame(np.random.randn(10,3)) + df.iloc[3:5,0] = np.nan + df.iloc[4:6,1] = np.nan + df.iloc[5:8,2] = np.nan + return df + + # series + df = create() + expected = df.fillna(df.mean()) + result = df.where(pd.notnull(df),df.mean(),axis='columns') + assert_frame_equal(result, expected) + + df.where(pd.notnull(df),df.mean(),inplace=True,axis='columns') + assert_frame_equal(df, expected) + + df = create().fillna(0) + expected = df.apply(lambda x, y: x.where(x>0,y), y=df[0]) + result = df.where(df>0,df[0],axis='index') + assert_frame_equal(result, expected) + + # frame + df = create() + expected = df.fillna(1) + result = df.where(pd.notnull(df),DataFrame(1,index=df.index,columns=df.columns)) + assert_frame_equal(result, expected) + def test_mask(self): df = DataFrame(np.random.randn(5, 3)) cond = df > 0