Merge pull request #2880 from stephenwlin/sanitize-masked-array

jreback · jreback · commit 5d8eb299cc11 · 2013-02-23T11:51:31.000-08:00
BUG: Series construction from MaskedArray fails for non-floating, non-object types

thanks!
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -23,7 +23,8 @@
 import numpy.ma as ma
 
 from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
-                                _default_index, _is_sequence, _infer_dtype_from_scalar)
+                                _default_index, _maybe_upcast, _is_sequence,
+                                _infer_dtype_from_scalar)
 from pandas.core.generic import NDFrame
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels,
@@ -390,9 +391,12 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
             mgr = self._init_dict(data, index, columns, dtype=dtype)
         elif isinstance(data, ma.MaskedArray):
             mask = ma.getmaskarray(data)
-            datacopy, fill_value = com._maybe_upcast(data, copy=True)
-            datacopy[mask] = fill_value
-            mgr = self._init_ndarray(datacopy, index, columns, dtype=dtype,
+            if mask.any():
+                data, fill_value = _maybe_upcast(data, copy=True)
+                data[mask] = fill_value
+            else:
+                data = data.copy()
+            mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                      copy=copy)
         elif isinstance(data, np.ndarray):
             if data.dtype.names:
@@ -2701,7 +2705,8 @@ def _reindex_with_indexers(self, index, row_indexer, columns, col_indexer,
 
         return DataFrame(new_data)
 
-    def reindex_like(self, other, method=None, copy=True, limit=None):
+    def reindex_like(self, other, method=None, copy=True, limit=None,
+                     fill_value=NA):
         """
         Reindex DataFrame to match indices of another DataFrame, optionally
         with filling logic
@@ -2724,7 +2729,8 @@ def reindex_like(self, other, method=None, copy=True, limit=None):
         reindexed : DataFrame
         """
         return self.reindex(index=other.index, columns=other.columns,
-                            method=method, copy=copy, limit=limit)
+                            method=method, copy=copy, limit=limit,
+                            fill_value=fill_value)
 
     truncate = generic.truncate
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -15,7 +15,7 @@
 import numpy.ma as ma
 
 from pandas.core.common import (isnull, notnull, _is_bool_indexer,
-                                _default_index, _maybe_promote,
+                                _default_index, _maybe_promote, _maybe_upcast,
                                 _asarray_tuplesafe, is_integer_dtype,
                                 _infer_dtype_from_scalar)
 from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
@@ -88,12 +88,15 @@ def wrapper(self, other):
 
             # rhs is either a timedelta or a series/ndarray
             if lib.is_timedelta_array(rvalues):
-                rvalues = pa.array([ np.timedelta64(v) for v in rvalues ],dtype='timedelta64[ns]')
+                rvalues = pa.array([np.timedelta64(v) for v in rvalues],
+                                   dtype='timedelta64[ns]')
                 dtype = 'M8[ns]'
             elif com.is_datetime64_dtype(rvalues):
                 dtype = 'timedelta64[ns]'
             else:
-                raise ValueError("cannot operate on a series with out a rhs of a series/ndarray of type datetime64[ns] or a timedelta")
+                raise ValueError('cannot operate on a series with out a rhs '
+                                 'of a series/ndarray of type datetime64[ns] '
+                                 'or a timedelta')
 
             lvalues = lvalues.view('i8')
             rvalues = rvalues.view('i8')
@@ -430,32 +433,32 @@ def from_array(cls, arr, index=None, name=None, copy=False):
 
     def __init__(self, data=None, index=None, dtype=None, name=None,
                  copy=False):
-        """One-dimensional ndarray with axis labels (including time
-series). Labels need not be unique but must be any hashable type. The object
-supports both integer- and label-based indexing and provides a host of methods
-for performing operations involving the index. Statistical methods from ndarray
-have been overridden to automatically exclude missing data (currently
-represented as NaN)
-
-Operations between Series (+, -, /, *, **) align values based on their
-associated index values-- they need not be the same length. The result
-index will be the sorted union of the two indexes.
-
-Parameters
-----------
-data : array-like, dict, or scalar value
-    Contains data stored in Series
-index : array-like or Index (1d)
+        """
+        One-dimensional ndarray with axis labels (including time series).
+        Labels need not be unique but must be any hashable type. The object
+        supports both integer- and label-based indexing and provides a host of
+        methods for performing operations involving the index. Statistical
+        methods from ndarray have been overridden to automatically exclude
+        missing data (currently represented as NaN)
 
-    Values must be unique and hashable, same length as data. Index object
-    (or other iterable of same length as data) Will default to
-    np.arange(len(data)) if not provided. If both a dict and index sequence
-    are used, the index will override the keys found in the dict.
+        Operations between Series (+, -, /, *, **) align values based on their
+        associated index values-- they need not be the same length. The result
+        index will be the sorted union of the two indexes.
 
-dtype : numpy.dtype or None
-    If None, dtype will be inferred copy : boolean, default False Copy
-    input data
-copy : boolean, default False
+        Parameters
+        ----------
+        data : array-like, dict, or scalar value
+            Contains data stored in Series
+        index : array-like or Index (1d)
+            Values must be unique and hashable, same length as data. Index
+            object (or other iterable of same length as data) Will default to
+            np.arange(len(data)) if not provided. If both a dict and index
+            sequence are used, the index will override the keys found in the
+            dict.
+        dtype : numpy.dtype or None
+            If None, dtype will be inferred
+        copy : boolean, default False
+            Copy input data
         """
         pass
 
@@ -769,7 +772,8 @@ def astype(self, dtype):
         See numpy.ndarray.astype
         """
         casted = com._astype_nansafe(self.values, dtype)
-        return self._constructor(casted, index=self.index, name=self.name, dtype=casted.dtype)
+        return self._constructor(casted, index=self.index, name=self.name,
+                                 dtype=casted.dtype)
 
     def convert_objects(self, convert_dates=True, convert_numeric=True):
         """
@@ -778,8 +782,12 @@ def convert_objects(self, convert_dates=True, convert_numeric=True):
 
         Parameters
         ----------
-        convert_dates : if True, attempt to soft convert_dates, if 'coerce', force conversion (and non-convertibles get NaT)
-        convert_numeric : if True attempt to coerce to numerbers (including strings), non-convertibles get NaN
+        convert_dates : boolean or 'coerce', default True
+            if True, attempt to soft convert dates; if 'coerce', force
+            conversion (and non-convertibles get NaT)
+        convert_numeric : boolean, default True
+            if True, attempt to coerce to numbers (including strings);
+            non-convertibles get NaN
 
         Returns
         -------
@@ -982,7 +990,8 @@ def __unicode__(self):
         """
         Return a string representation for a particular DataFrame
 
-        Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3.
+        Invoked by unicode(df) in py2 only. Yields a Unicode String in both
+        py2/py3.
         """
         width, height = get_terminal_size()
         max_rows = (height if get_option("display.max_rows") == 0
@@ -2416,7 +2425,7 @@ def reindex_axis(self, labels, axis=0, **kwargs):
             raise ValueError("cannot reindex series on non-zero axis!")
         return self.reindex(index=labels,**kwargs)
 
-    def reindex_like(self, other, method=None, limit=None):
+    def reindex_like(self, other, method=None, limit=None, fill_value=pa.NA):
         """
         Reindex Series to match index of another Series, optionally with
         filling logic
@@ -2437,7 +2446,8 @@ def reindex_like(self, other, method=None, limit=None):
         -------
         reindexed : Series
         """
-        return self.reindex(other.index, method=method, limit=limit)
+        return self.reindex(other.index, method=method, limit=limit,
+                            fill_value=fill_value)
 
     def take(self, indices, axis=0):
         """
@@ -3060,10 +3070,14 @@ def remove_na(arr):
 
 def _sanitize_array(data, index, dtype=None, copy=False,
                     raise_cast_failure=False):
+
     if isinstance(data, ma.MaskedArray):
         mask = ma.getmaskarray(data)
-        data = ma.copy(data)
-        data[mask] = pa.NA
+        if mask.any():
+            data, fill_value = _maybe_upcast(data, copy=True)
+            data[mask] = fill_value
+        else:
+            data = data.copy()
 
     def _try_cast(arr):
         try:
@@ -3112,7 +3126,7 @@ def _try_cast(arr):
                     raise
                 subarr = pa.array(data, dtype=object, copy=copy)
                 subarr = lib.maybe_convert_objects(subarr)
-            
+
         else:
             subarr = com._possibly_convert_platform(data)
 
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -40,7 +40,7 @@ def _skip_if_no_pytz():
     except ImportError:
         raise nose.SkipTest
 
-#-------------------------------------------------------------------------------
+#------------------------------------------------------------------------------
 # Series test cases
 
 JOIN_TYPES = ['inner', 'outer', 'left', 'right']
@@ -342,6 +342,65 @@ def test_constructor_maskedarray(self):
         expected = Series([0.0, nan, 2.0], index=index)
         assert_series_equal(result, expected)
 
+        data[1] = 1.0
+        result = Series(data, index=index)
+        expected = Series([0.0, 1.0, 2.0], index=index)
+        assert_series_equal(result, expected)
+
+        data = ma.masked_all((3,), dtype=int)
+        result = Series(data)
+        expected = Series([nan, nan, nan], dtype=float)
+        assert_series_equal(result, expected)
+
+        data[0] = 0
+        data[2] = 2
+        index = ['a', 'b', 'c']
+        result = Series(data, index=index)
+        expected = Series([0, nan, 2], index=index, dtype=float)
+        assert_series_equal(result, expected)
+
+        data[1] = 1
+        result = Series(data, index=index)
+        expected = Series([0, 1, 2], index=index, dtype=int)
+        assert_series_equal(result, expected)
+
+        data = ma.masked_all((3,), dtype=bool)
+        result = Series(data)
+        expected = Series([nan, nan, nan], dtype=object)
+        assert_series_equal(result, expected)
+
+        data[0] = True
+        data[2] = False
+        index = ['a', 'b', 'c']
+        result = Series(data, index=index)
+        expected = Series([True, nan, False], index=index, dtype=object)
+        assert_series_equal(result, expected)
+
+        data[1] = True
+        result = Series(data, index=index)
+        expected = Series([True, True, False], index=index, dtype=bool)
+        assert_series_equal(result, expected)
+
+        from pandas import tslib
+        data = ma.masked_all((3,), dtype='M8[ns]')
+        result = Series(data)
+        expected = Series([tslib.iNaT, tslib.iNaT, tslib.iNaT], dtype='M8[ns]')
+        assert_series_equal(result, expected)
+
+        data[0] = datetime(2001, 1, 1)
+        data[2] = datetime(2001, 1, 3)
+        index = ['a', 'b', 'c']
+        result = Series(data, index=index)
+        expected = Series([datetime(2001, 1, 1), tslib.iNaT,
+                           datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
+        assert_series_equal(result, expected)
+
+        data[1] = datetime(2001, 1, 2)
+        result = Series(data, index=index)
+        expected = Series([datetime(2001, 1, 1), datetime(2001, 1, 2),
+                           datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
+        assert_series_equal(result, expected)
+
     def test_constructor_default_index(self):
         s = Series([0, 1, 2])
         assert_almost_equal(s.index, np.arange(3))
@@ -2922,7 +2981,7 @@ def test_convert_objects(self):
         result = s.convert_objects(convert_dates=True,convert_numeric=False)
         expected = Series([Timestamp('20010101'),Timestamp('20010102'),Timestamp('20010103')],dtype='M8[ns]')
         assert_series_equal(expected,result)
-        
+
         result = s.convert_objects(convert_dates='coerce',convert_numeric=False)
         assert_series_equal(expected,result)
         result = s.convert_objects(convert_dates='coerce',convert_numeric=True)
@@ -3306,7 +3365,7 @@ def test_fillna_int(self):
         s.fillna(method='ffill', inplace=True)
         assert_series_equal(s.fillna(method='ffill', inplace=False), s)
 
-#-------------------------------------------------------------------------------
+#------------------------------------------------------------------------------
 # TimeSeries-specific
 
     def test_fillna(self):
@@ -3569,7 +3628,7 @@ def test_mpl_compat_hack(self):
         expected = self.ts.values[:, np.newaxis]
         assert_almost_equal(result, expected)
 
-#-------------------------------------------------------------------------------
+#------------------------------------------------------------------------------
 # GroupBy
 
     def test_select(self):
@@ -3582,7 +3641,7 @@ def test_select(self):
         expected = self.ts[self.ts.weekday == 2]
         assert_series_equal(result, expected)
 
-#----------------------------------------------------------------------
+#------------------------------------------------------------------------------
 # Misc not safe for sparse
 
     def test_dropna_preserve_name(self):