Skip to content

Commit 5d8eb29

Browse files
committed
Merge pull request #2880 from stephenwlin/sanitize-masked-array
BUG: Series construction from MaskedArray fails for non-floating, non-object types. Thanks!
2 parents a27ba81 + fa1736d commit 5d8eb29

File tree

3 files changed

+126
-47
lines changed

3 files changed

+126
-47
lines changed

pandas/core/frame.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@
2323
import numpy.ma as ma
2424

2525
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
26-
_default_index, _is_sequence, _infer_dtype_from_scalar)
26+
_default_index, _maybe_upcast, _is_sequence,
27+
_infer_dtype_from_scalar)
2728
from pandas.core.generic import NDFrame
2829
from pandas.core.index import Index, MultiIndex, _ensure_index
2930
from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels,
@@ -390,9 +391,12 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
390391
mgr = self._init_dict(data, index, columns, dtype=dtype)
391392
elif isinstance(data, ma.MaskedArray):
392393
mask = ma.getmaskarray(data)
393-
datacopy, fill_value = com._maybe_upcast(data, copy=True)
394-
datacopy[mask] = fill_value
395-
mgr = self._init_ndarray(datacopy, index, columns, dtype=dtype,
394+
if mask.any():
395+
data, fill_value = _maybe_upcast(data, copy=True)
396+
data[mask] = fill_value
397+
else:
398+
data = data.copy()
399+
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
396400
copy=copy)
397401
elif isinstance(data, np.ndarray):
398402
if data.dtype.names:
@@ -2701,7 +2705,8 @@ def _reindex_with_indexers(self, index, row_indexer, columns, col_indexer,
27012705

27022706
return DataFrame(new_data)
27032707

2704-
def reindex_like(self, other, method=None, copy=True, limit=None):
2708+
def reindex_like(self, other, method=None, copy=True, limit=None,
2709+
fill_value=NA):
27052710
"""
27062711
Reindex DataFrame to match indices of another DataFrame, optionally
27072712
with filling logic
@@ -2724,7 +2729,8 @@ def reindex_like(self, other, method=None, copy=True, limit=None):
27242729
reindexed : DataFrame
27252730
"""
27262731
return self.reindex(index=other.index, columns=other.columns,
2727-
method=method, copy=copy, limit=limit)
2732+
method=method, copy=copy, limit=limit,
2733+
fill_value=fill_value)
27282734

27292735
truncate = generic.truncate
27302736

pandas/core/series.py

Lines changed: 50 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import numpy.ma as ma
1616

1717
from pandas.core.common import (isnull, notnull, _is_bool_indexer,
18-
_default_index, _maybe_promote,
18+
_default_index, _maybe_promote, _maybe_upcast,
1919
_asarray_tuplesafe, is_integer_dtype,
2020
_infer_dtype_from_scalar)
2121
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
@@ -88,12 +88,15 @@ def wrapper(self, other):
8888

8989
# rhs is either a timedelta or a series/ndarray
9090
if lib.is_timedelta_array(rvalues):
91-
rvalues = pa.array([ np.timedelta64(v) for v in rvalues ],dtype='timedelta64[ns]')
91+
rvalues = pa.array([np.timedelta64(v) for v in rvalues],
92+
dtype='timedelta64[ns]')
9293
dtype = 'M8[ns]'
9394
elif com.is_datetime64_dtype(rvalues):
9495
dtype = 'timedelta64[ns]'
9596
else:
96-
raise ValueError("cannot operate on a series with out a rhs of a series/ndarray of type datetime64[ns] or a timedelta")
97+
raise ValueError('cannot operate on a series with out a rhs '
98+
'of a series/ndarray of type datetime64[ns] '
99+
'or a timedelta')
97100

98101
lvalues = lvalues.view('i8')
99102
rvalues = rvalues.view('i8')
@@ -430,32 +433,32 @@ def from_array(cls, arr, index=None, name=None, copy=False):
430433

431434
def __init__(self, data=None, index=None, dtype=None, name=None,
432435
copy=False):
433-
"""One-dimensional ndarray with axis labels (including time
434-
series). Labels need not be unique but must be any hashable type. The object
435-
supports both integer- and label-based indexing and provides a host of methods
436-
for performing operations involving the index. Statistical methods from ndarray
437-
have been overridden to automatically exclude missing data (currently
438-
represented as NaN)
439-
440-
Operations between Series (+, -, /, *, **) align values based on their
441-
associated index values-- they need not be the same length. The result
442-
index will be the sorted union of the two indexes.
443-
444-
Parameters
445-
----------
446-
data : array-like, dict, or scalar value
447-
Contains data stored in Series
448-
index : array-like or Index (1d)
436+
"""
437+
One-dimensional ndarray with axis labels (including time series).
438+
Labels need not be unique but must be any hashable type. The object
439+
supports both integer- and label-based indexing and provides a host of
440+
methods for performing operations involving the index. Statistical
441+
methods from ndarray have been overridden to automatically exclude
442+
missing data (currently represented as NaN)
449443
450-
Values must be unique and hashable, same length as data. Index object
451-
(or other iterable of same length as data) Will default to
452-
np.arange(len(data)) if not provided. If both a dict and index sequence
453-
are used, the index will override the keys found in the dict.
444+
Operations between Series (+, -, /, *, **) align values based on their
445+
associated index values-- they need not be the same length. The result
446+
index will be the sorted union of the two indexes.
454447
455-
dtype : numpy.dtype or None
456-
If None, dtype will be inferred copy : boolean, default False Copy
457-
input data
458-
copy : boolean, default False
448+
Parameters
449+
----------
450+
data : array-like, dict, or scalar value
451+
Contains data stored in Series
452+
index : array-like or Index (1d)
453+
Values must be unique and hashable, same length as data. Index
454+
object (or other iterable of same length as data) Will default to
455+
np.arange(len(data)) if not provided. If both a dict and index
456+
sequence are used, the index will override the keys found in the
457+
dict.
458+
dtype : numpy.dtype or None
459+
If None, dtype will be inferred copy : boolean, default False Copy
460+
input data
461+
copy : boolean, default False
459462
"""
460463
pass
461464

@@ -769,7 +772,8 @@ def astype(self, dtype):
769772
See numpy.ndarray.astype
770773
"""
771774
casted = com._astype_nansafe(self.values, dtype)
772-
return self._constructor(casted, index=self.index, name=self.name, dtype=casted.dtype)
775+
return self._constructor(casted, index=self.index, name=self.name,
776+
dtype=casted.dtype)
773777

774778
def convert_objects(self, convert_dates=True, convert_numeric=True):
775779
"""
@@ -778,8 +782,12 @@ def convert_objects(self, convert_dates=True, convert_numeric=True):
778782
779783
Parameters
780784
----------
781-
convert_dates : if True, attempt to soft convert_dates, if 'coerce', force conversion (and non-convertibles get NaT)
782-
convert_numeric : if True attempt to coerce to numerbers (including strings), non-convertibles get NaN
785+
convert_dates : boolean, default True
786+
if True, attempt to soft convert_dates, if 'coerce', force
787+
conversion (and non-convertibles get NaT)
788+
convert_numeric : boolean, default True
789+
if True attempt to coerce to numbers (including strings),
790+
non-convertibles get NaN
783791
784792
Returns
785793
-------
@@ -982,7 +990,8 @@ def __unicode__(self):
982990
"""
983991
Return a string representation for a particular DataFrame
984992
985-
Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3.
993+
Invoked by unicode(df) in py2 only. Yields a Unicode String in both
994+
py2/py3.
986995
"""
987996
width, height = get_terminal_size()
988997
max_rows = (height if get_option("display.max_rows") == 0
@@ -2416,7 +2425,7 @@ def reindex_axis(self, labels, axis=0, **kwargs):
24162425
raise ValueError("cannot reindex series on non-zero axis!")
24172426
return self.reindex(index=labels,**kwargs)
24182427

2419-
def reindex_like(self, other, method=None, limit=None):
2428+
def reindex_like(self, other, method=None, limit=None, fill_value=pa.NA):
24202429
"""
24212430
Reindex Series to match index of another Series, optionally with
24222431
filling logic
@@ -2437,7 +2446,8 @@ def reindex_like(self, other, method=None, limit=None):
24372446
-------
24382447
reindexed : Series
24392448
"""
2440-
return self.reindex(other.index, method=method, limit=limit)
2449+
return self.reindex(other.index, method=method, limit=limit,
2450+
fill_value=fill_value)
24412451

24422452
def take(self, indices, axis=0):
24432453
"""
@@ -3060,10 +3070,14 @@ def remove_na(arr):
30603070

30613071
def _sanitize_array(data, index, dtype=None, copy=False,
30623072
raise_cast_failure=False):
3073+
30633074
if isinstance(data, ma.MaskedArray):
30643075
mask = ma.getmaskarray(data)
3065-
data = ma.copy(data)
3066-
data[mask] = pa.NA
3076+
if mask.any():
3077+
data, fill_value = _maybe_upcast(data, copy=True)
3078+
data[mask] = fill_value
3079+
else:
3080+
data = data.copy()
30673081

30683082
def _try_cast(arr):
30693083
try:
@@ -3112,7 +3126,7 @@ def _try_cast(arr):
31123126
raise
31133127
subarr = pa.array(data, dtype=object, copy=copy)
31143128
subarr = lib.maybe_convert_objects(subarr)
3115-
3129+
31163130
else:
31173131
subarr = com._possibly_convert_platform(data)
31183132

pandas/tests/test_series.py

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def _skip_if_no_pytz():
4040
except ImportError:
4141
raise nose.SkipTest
4242

43-
#-------------------------------------------------------------------------------
43+
#------------------------------------------------------------------------------
4444
# Series test cases
4545

4646
JOIN_TYPES = ['inner', 'outer', 'left', 'right']
@@ -342,6 +342,65 @@ def test_constructor_maskedarray(self):
342342
expected = Series([0.0, nan, 2.0], index=index)
343343
assert_series_equal(result, expected)
344344

345+
data[1] = 1.0
346+
result = Series(data, index=index)
347+
expected = Series([0.0, 1.0, 2.0], index=index)
348+
assert_series_equal(result, expected)
349+
350+
data = ma.masked_all((3,), dtype=int)
351+
result = Series(data)
352+
expected = Series([nan, nan, nan], dtype=float)
353+
assert_series_equal(result, expected)
354+
355+
data[0] = 0
356+
data[2] = 2
357+
index = ['a', 'b', 'c']
358+
result = Series(data, index=index)
359+
expected = Series([0, nan, 2], index=index, dtype=float)
360+
assert_series_equal(result, expected)
361+
362+
data[1] = 1
363+
result = Series(data, index=index)
364+
expected = Series([0, 1, 2], index=index, dtype=int)
365+
assert_series_equal(result, expected)
366+
367+
data = ma.masked_all((3,), dtype=bool)
368+
result = Series(data)
369+
expected = Series([nan, nan, nan], dtype=object)
370+
assert_series_equal(result, expected)
371+
372+
data[0] = True
373+
data[2] = False
374+
index = ['a', 'b', 'c']
375+
result = Series(data, index=index)
376+
expected = Series([True, nan, False], index=index, dtype=object)
377+
assert_series_equal(result, expected)
378+
379+
data[1] = True
380+
result = Series(data, index=index)
381+
expected = Series([True, True, False], index=index, dtype=bool)
382+
assert_series_equal(result, expected)
383+
384+
from pandas import tslib
385+
data = ma.masked_all((3,), dtype='M8[ns]')
386+
result = Series(data)
387+
expected = Series([tslib.iNaT, tslib.iNaT, tslib.iNaT], dtype='M8[ns]')
388+
assert_series_equal(result, expected)
389+
390+
data[0] = datetime(2001, 1, 1)
391+
data[2] = datetime(2001, 1, 3)
392+
index = ['a', 'b', 'c']
393+
result = Series(data, index=index)
394+
expected = Series([datetime(2001, 1, 1), tslib.iNaT,
395+
datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
396+
assert_series_equal(result, expected)
397+
398+
data[1] = datetime(2001, 1, 2)
399+
result = Series(data, index=index)
400+
expected = Series([datetime(2001, 1, 1), datetime(2001, 1, 2),
401+
datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
402+
assert_series_equal(result, expected)
403+
345404
def test_constructor_default_index(self):
346405
s = Series([0, 1, 2])
347406
assert_almost_equal(s.index, np.arange(3))
@@ -2922,7 +2981,7 @@ def test_convert_objects(self):
29222981
result = s.convert_objects(convert_dates=True,convert_numeric=False)
29232982
expected = Series([Timestamp('20010101'),Timestamp('20010102'),Timestamp('20010103')],dtype='M8[ns]')
29242983
assert_series_equal(expected,result)
2925-
2984+
29262985
result = s.convert_objects(convert_dates='coerce',convert_numeric=False)
29272986
assert_series_equal(expected,result)
29282987
result = s.convert_objects(convert_dates='coerce',convert_numeric=True)
@@ -3306,7 +3365,7 @@ def test_fillna_int(self):
33063365
s.fillna(method='ffill', inplace=True)
33073366
assert_series_equal(s.fillna(method='ffill', inplace=False), s)
33083367

3309-
#-------------------------------------------------------------------------------
3368+
#------------------------------------------------------------------------------
33103369
# TimeSeries-specific
33113370

33123371
def test_fillna(self):
@@ -3569,7 +3628,7 @@ def test_mpl_compat_hack(self):
35693628
expected = self.ts.values[:, np.newaxis]
35703629
assert_almost_equal(result, expected)
35713630

3572-
#-------------------------------------------------------------------------------
3631+
#------------------------------------------------------------------------------
35733632
# GroupBy
35743633

35753634
def test_select(self):
@@ -3582,7 +3641,7 @@ def test_select(self):
35823641
expected = self.ts[self.ts.weekday == 2]
35833642
assert_series_equal(result, expected)
35843643

3585-
#----------------------------------------------------------------------
3644+
#------------------------------------------------------------------------------
35863645
# Misc not safe for sparse
35873646

35883647
def test_dropna_preserve_name(self):

0 commit comments

Comments
 (0)