From 2c9e2ea1cfb77e38af570fb1ff4c53a730a4e731 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 2 Dec 2018 16:47:11 +0100 Subject: [PATCH 01/12] DEPR: switch default of skipna-kwarg in infer_dtype to True --- pandas/_libs/lib.pyx | 18 +++---- pandas/conftest.py | 14 +++--- pandas/core/algorithms.py | 4 +- pandas/core/arrays/integer.py | 2 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/indexes/base.py | 2 +- pandas/io/sql.py | 23 +++------ pandas/io/stata.py | 2 +- pandas/tests/dtypes/test_inference.py | 67 +++++++++++++++------------ pandas/tests/test_strings.py | 23 +++++---- 10 files changed, 78 insertions(+), 79 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0c081986d83c5..20ae1de304c9d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -622,7 +622,7 @@ def clean_index_list(obj: list): return obj, all_arrays # don't force numpy coerce with nan's - inferred = infer_dtype(obj) + inferred = infer_dtype(obj, skipna=False) if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']: return np.asarray(obj, dtype=object), 0 elif inferred in ['integer']: @@ -1078,7 +1078,7 @@ cdef _try_infer_map(v): return None -def infer_dtype(value: object, skipna: bool=False) -> str: +def infer_dtype(value: object, skipna: bool=True) -> str: """ Efficiently infer the type of a passed val, or list-like array of values. Return a string describing the type. @@ -1086,11 +1086,12 @@ def infer_dtype(value: object, skipna: bool=False) -> str: Parameters ---------- value : scalar, list, ndarray, or pandas type - skipna : bool, default False - Ignore NaN values when inferring the type. The default of ``False`` - will be deprecated in a later version of pandas. + skipna : bool, default True + Ignore NaN values when inferring the type. .. versionadded:: 0.21.0 + .. versionchanged:: 0.24.0 + Switched default of ``skipna`` to ``True`` Returns ------- @@ -1209,6 +1210,10 @@ def infer_dtype(value: object, skipna: bool=False) -> str: values = construct_1d_object_array_from_listlike(value) values = getattr(values, 'values', values) + + # make contiguous + values = values.ravel() + if skipna: values = values[~isnaobj(values)] @@ -1219,9 +1224,6 @@ def infer_dtype(value: object, skipna: bool=False) -> str: if values.dtype != np.object_: values = values.astype('O') - # make contiguous - values = values.ravel() - n = len(values) if n == 0: return 'empty' diff --git a/pandas/conftest.py b/pandas/conftest.py index 20f97bdec1107..c833e8abaeb9e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -545,7 +545,7 @@ def any_numpy_dtype(request): # categoricals are handled separately -_any_skipna_inferred_dtype = [ +_any_inferred_dtype = [ ('string', ['a', np.nan, 'c']), ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]), ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']), @@ -570,11 +570,11 @@ def any_numpy_dtype(request): ('time', [time(1), np.nan, time(2)]), ('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]), ('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)])] -ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id +ids, _ = zip(*_any_inferred_dtype) # use inferred type as fixture-id -@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids) -def any_skipna_inferred_dtype(request): +@pytest.fixture(params=_any_inferred_dtype, ids=ids) +def any_inferred_dtype(request): """ Fixture for all inferred dtypes from _libs.lib.infer_dtype @@ -610,10 +610,10 @@ def any_skipna_inferred_dtype(request): -------- >>> import pandas._libs.lib as lib >>> - >>> def test_something(any_skipna_inferred_dtype): - ... inferred_dtype, values = any_skipna_inferred_dtype + >>> def test_something(any_inferred_dtype): + ... inferred_dtype, values = any_inferred_dtype ... # will pass - ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + ... assert lib.infer_dtype(values) == inferred_dtype """ inferred_dtype, values = request.param values = np.array(values, dtype=object) # object dtype to avoid casting diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1a4368ee8ea98..1c74e4d7a3359 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -203,7 +203,7 @@ def _get_hashtable_algo(values): if ndtype == 'object': # its cheaper to use a String Hash Table than Object - if lib.infer_dtype(values) in ['string']: + if lib.infer_dtype(values, skipna=False) in ['string']: ndtype = 'string' else: ndtype = 'object' @@ -221,7 +221,7 @@ def _get_data_algo(values, func_map): if ndtype == 'object': # its cheaper to use a String Hash Table than Object - if lib.infer_dtype(values) in ['string']: + if lib.infer_dtype(values, skipna=False) in ['string']: ndtype = 'string' f = func_map.get(ndtype, func_map['object']) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index e9d51aaea4218..f42152a765d08 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -169,7 +169,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False): values = np.array(values, copy=copy) if is_object_dtype(values): inferred_type = lib.infer_dtype(values) - if inferred_type is 'mixed' and isna(values).all(): + if inferred_type == 'empty': values = np.empty(len(values)) values.fill(np.nan) elif inferred_type not in ['floating', 'integer', diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index eae9eb97f35fe..57c3f26a67c7e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -940,7 +940,7 @@ def try_timedelta(v): # e.g. '00:00:01' is a timedelta but # technically is also a datetime value = try_timedelta(v) - if lib.infer_dtype(value) in ['mixed']: + if lib.infer_dtype(value, skipna=False) in ['mixed']: value = try_datetime(v) return value diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e850db4178f41..c0587cc7a8d7c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -409,7 +409,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, subarr = subarr.copy() if dtype is None: - inferred = lib.infer_dtype(subarr) + inferred = lib.infer_dtype(subarr, skipna=False) if inferred == 'integer': try: return cls._try_convert_to_int_index( diff --git a/pandas/io/sql.py b/pandas/io/sql.py index e65e3dff1936a..eeccd51a79d88 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -819,27 +819,15 @@ def _harmonize_columns(self, parse_dates=None): except KeyError: pass # this column not in results - def _get_notna_col_dtype(self, col): - """ - Infer datatype of the Series col. In case the dtype of col is 'object' - and it contains NA values, this infers the datatype of the not-NA - values. Needed for inserting typed data containing NULLs, GH8778. - """ - col_for_inference = col - if col.dtype == 'object': - notnadata = col[~isna(col)] - if len(notnadata): - col_for_inference = notnadata - - return lib.infer_dtype(col_for_inference) - def _sqlalchemy_type(self, col): dtype = self.dtype or {} if col.name in dtype: return self.dtype[col.name] - col_type = self._get_notna_col_dtype(col) + # Infer type of column, while ignoring missing values. + # Needed for inserting typed data containing NULLs, GH 8778. + col_type = lib.infer_dtype(col) from sqlalchemy.types import (BigInteger, Integer, Float, Text, Boolean, @@ -1325,7 +1313,10 @@ def _sql_type_name(self, col): if col.name in dtype: return dtype[col.name] - col_type = self._get_notna_col_dtype(col) + # Infer type of column, while ignoring missing values. + # Needed for inserting typed data containing NULLs, GH 8778. + col_type = lib.infer_dtype(col) + if col_type == 'timedelta64': warnings.warn("the 'timedelta' type is not supported, and will be " "written as integer values (ns frequency) to the " diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 403137b695cb7..0f3e129610d60 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1865,7 +1865,7 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, if force_strl: return '%9s' if dtype.type == np.object_: - inferred_dtype = infer_dtype(column.dropna()) + inferred_dtype = infer_dtype(column) if not (inferred_dtype in ('string', 'unicode') or len(column) == 0): raise ValueError('Column `{col}` cannot be exported.\n\nOnly ' diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 0c22b595bc74d..5ccbffa3656a3 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -496,12 +496,12 @@ class TestTypeInference(object): class Dummy(): pass - def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype): + def test_inferred_dtype_fixture(self, any_inferred_dtype): # see pandas/conftest.py - inferred_dtype, values = any_skipna_inferred_dtype + inferred_dtype, values = any_inferred_dtype # make sure the inferred dtype of the fixture is as requested - assert inferred_dtype == lib.infer_dtype(values, skipna=True) + assert inferred_dtype == lib.infer_dtype(values) def test_length_zero(self): result = lib.infer_dtype(np.array([], dtype='i4')) @@ -547,9 +547,12 @@ def test_bools(self): assert result == 'boolean' arr = np.array([True, np.nan, False], dtype='O') - result = lib.infer_dtype(arr, skipna=True) + result = lib.infer_dtype(arr) assert result == 'boolean' + result = lib.infer_dtype(arr, skipna=False) + assert result == 'mixed' + def test_floats(self): arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') result = lib.infer_dtype(arr) @@ -591,11 +594,11 @@ def test_string(self): def test_unicode(self): arr = [u'a', np.nan, u'c'] - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=False) assert result == 'mixed' arr = [u'a', np.nan, u'c'] - result = lib.infer_dtype(arr, skipna=True) + result = lib.infer_dtype(arr) expected = 'unicode' if PY2 else 'string' assert result == expected @@ -657,11 +660,11 @@ def test_infer_dtype_datetime(self): # different type of nat arr = np.array([np.timedelta64('nat'), np.datetime64('2011-01-02')], dtype=object) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr, skipna=False) == 'mixed' arr = np.array([np.datetime64('2011-01-02'), np.timedelta64('nat')], dtype=object) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr, skipna=False) == 'mixed' # mixed datetime arr = np.array([datetime(2011, 1, 1), @@ -722,11 +725,11 @@ def test_infer_dtype_timedelta(self): # different type of nat arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')], dtype=object) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr, skipna=False) == 'mixed' arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')], dtype=object) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr, skipna=False) == 'mixed' def test_infer_dtype_period(self): # GH 13664 @@ -749,11 +752,11 @@ def test_infer_dtype_period(self): # different type of nat arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')], dtype=object) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr, skipna=False) == 'mixed' arr = np.array([pd.Period('2011-01', freq='M'), np.datetime64('nat')], dtype=object) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr, skipna=False) == 'mixed' @pytest.mark.parametrize( "data", @@ -827,56 +830,58 @@ def test_infer_dtype_all_nan_nat_like(self): # nan and None mix are result in mixed arr = np.array([np.nan, np.nan, None]) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr) == 'empty' + assert lib.infer_dtype(arr, skipna=False) == 'mixed' arr = np.array([None, np.nan, np.nan]) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr) == 'empty' + assert lib.infer_dtype(arr, skipna=False) == 'mixed' # pd.NaT arr = np.array([pd.NaT]) - assert lib.infer_dtype(arr) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == 'datetime' arr = np.array([pd.NaT, np.nan]) - assert lib.infer_dtype(arr) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == 'datetime' arr = np.array([np.nan, pd.NaT]) - assert lib.infer_dtype(arr) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == 'datetime' arr = np.array([np.nan, pd.NaT, np.nan]) - assert lib.infer_dtype(arr) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == 'datetime' arr = np.array([None, pd.NaT, None]) - assert lib.infer_dtype(arr) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == 'datetime' # np.datetime64(nat) arr = np.array([np.datetime64('nat')]) - assert lib.infer_dtype(arr) == 'datetime64' + assert lib.infer_dtype(arr, skipna=False) == 'datetime64' for n in [np.nan, pd.NaT, None]: arr = np.array([n, np.datetime64('nat'), n]) - assert lib.infer_dtype(arr) == 'datetime64' + assert lib.infer_dtype(arr, skipna=False) == 'datetime64' arr = np.array([pd.NaT, n, np.datetime64('nat'), n]) - assert lib.infer_dtype(arr) == 'datetime64' + assert lib.infer_dtype(arr, skipna=False) == 'datetime64' arr = np.array([np.timedelta64('nat')], dtype=object) - assert lib.infer_dtype(arr) == 'timedelta' + assert lib.infer_dtype(arr, skipna=False) == 'timedelta' for n in [np.nan, pd.NaT, None]: arr = np.array([n, np.timedelta64('nat'), n]) - assert lib.infer_dtype(arr) == 'timedelta' + assert lib.infer_dtype(arr, skipna=False) == 'timedelta' arr = np.array([pd.NaT, n, np.timedelta64('nat'), n]) - assert lib.infer_dtype(arr) == 'timedelta' + assert lib.infer_dtype(arr, skipna=False) == 'timedelta' # datetime / timedelta mixed arr = np.array([pd.NaT, np.datetime64('nat'), np.timedelta64('nat'), np.nan]) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr, skipna=False) == 'mixed' arr = np.array([np.timedelta64('nat'), np.datetime64('nat')], dtype=object) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr, skipna=False) == 'mixed' def test_is_datetimelike_array_all_nan_nat_like(self): arr = np.array([np.nan, pd.NaT, np.datetime64('nat')]) @@ -940,10 +945,10 @@ def test_date(self): assert index.inferred_type == 'date' dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan] - result = lib.infer_dtype(dates) + result = lib.infer_dtype(dates, skipna=False) assert result == 'mixed' - result = lib.infer_dtype(dates, skipna=True) + result = lib.infer_dtype(dates) assert result == 'date' def test_is_numeric_array(self): @@ -984,8 +989,10 @@ def test_object(self): # GH 7431 # cannot infer more than this as only a single element arr = np.array([None], dtype='O') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=False) assert result == 'mixed' + result = lib.infer_dtype(arr) + assert result == 'empty' def test_to_object_array_width(self): # see gh-13320 diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 117984ce89743..31694d6b7ae50 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -119,7 +119,7 @@ def any_string_method(request): # subset of the full set from pandas/conftest.py -_any_allowed_skipna_inferred_dtype = [ +_any_allowed_inferred_dtype = [ ('string', ['a', np.nan, 'c']), ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]), ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']), @@ -127,11 +127,11 @@ def any_string_method(request): ('empty', []), ('mixed-integer', ['a', np.nan, 2]) ] -ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id +ids, _ = zip(*_any_allowed_inferred_dtype) # use inferred type as id -@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids) -def any_allowed_skipna_inferred_dtype(request): +@pytest.fixture(params=_any_allowed_inferred_dtype, ids=ids) +def any_allowed_inferred_dtype(request): """ Fixture for all (inferred) dtypes allowed in StringMethods.__init__ @@ -155,10 +155,10 @@ def any_allowed_skipna_inferred_dtype(request): -------- >>> import pandas._libs.lib as lib >>> - >>> def test_something(any_allowed_skipna_inferred_dtype): - ... inferred_dtype, values = any_skipna_inferred_dtype + >>> def test_something(any_allowed_inferred_dtype): + ... inferred_dtype, values = any_allowed_inferred_dtype ... # will pass - ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + ... assert lib.infer_dtype(values) == inferred_dtype """ inferred_dtype, values = request.param values = np.array(values, dtype=object) # object dtype to avoid casting @@ -177,9 +177,9 @@ def test_api(self): @pytest.mark.parametrize('dtype', [object, 'category']) @pytest.mark.parametrize('box', [Series, Index]) - def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): + def test_api_per_dtype(self, box, dtype, any_inferred_dtype): # one instance of parametrized fixture - inferred_dtype, values = any_skipna_inferred_dtype + inferred_dtype, values = any_inferred_dtype t = box(values, dtype=dtype) # explicit dtype to avoid casting @@ -217,15 +217,14 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): @pytest.mark.parametrize('dtype', [object, 'category']) @pytest.mark.parametrize('box', [Series, Index]) - def test_api_per_method(self, box, dtype, - any_allowed_skipna_inferred_dtype, + def test_api_per_method(self, box, dtype, any_allowed_inferred_dtype, any_string_method): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, # and raise on all others # one instance of each parametrized fixture - inferred_dtype, values = any_allowed_skipna_inferred_dtype + inferred_dtype, values = any_allowed_inferred_dtype method_name, args, kwargs = any_string_method # TODO: get rid of these xfails From 21f79639caf99f4e47b72160f5633c658a11f121 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 6 Dec 2018 01:21:45 +0100 Subject: [PATCH 02/12] Change default of skipna to None & warn; add whatsnew --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/_libs/lib.pyx | 13 ++- pandas/conftest.py | 14 +-- pandas/core/algorithms.py | 2 +- pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/timedeltas.py | 2 +- pandas/core/dtypes/cast.py | 5 +- pandas/core/dtypes/common.py | 3 +- pandas/core/dtypes/missing.py | 2 +- pandas/core/indexes/base.py | 6 +- pandas/core/indexes/multi.py | 3 +- pandas/core/internals/construction.py | 2 +- pandas/core/reshape/merge.py | 3 +- pandas/core/reshape/tile.py | 2 +- pandas/core/series.py | 4 +- pandas/core/sorting.py | 2 +- pandas/io/parsers.py | 2 +- pandas/io/pytables.py | 12 +-- pandas/io/sql.py | 4 +- pandas/io/stata.py | 4 +- pandas/tests/dtypes/test_inference.py | 131 ++++++++++++++------------ pandas/tests/test_strings.py | 23 ++--- 23 files changed, 131 insertions(+), 113 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 4e12b22c8ccac..ae2dc7927bed5 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1136,6 +1136,7 @@ Deprecations - :func:`pandas.types.is_datetimetz` is deprecated in favor of `pandas.types.is_datetime64tz` (:issue:`23917`) - Creating a :class:`TimedeltaIndex` or :class:`DatetimeIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range` and :func:`date_range` (:issue:`23919`) - Passing a string alias like ``'datetime64[ns, UTC]'`` as the `unit` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`). +- The ``skipna`` parameter of :meth:`~pandas.api.types.infer_dtype` will switch to ``True`` by default in a future version of pandas (:issue:`17066`, :issue:`24050`) .. _whatsnew_0240.deprecations.datetimelike_int_ops: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 20ae1de304c9d..155cb4f13b0f7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -4,6 +4,7 @@ from fractions import Fraction from numbers import Number import sys +import warnings import cython from cython import Py_ssize_t @@ -1078,7 +1079,7 @@ cdef _try_infer_map(v): return None -def infer_dtype(value: object, skipna: bool=True) -> str: +def infer_dtype(value: object, skipna: object=None) -> str: """ Efficiently infer the type of a passed val, or list-like array of values. Return a string describing the type. @@ -1086,12 +1087,10 @@ def infer_dtype(value: object, skipna: bool=True) -> str: Parameters ---------- value : scalar, list, ndarray, or pandas type - skipna : bool, default True + skipna : bool, default None Ignore NaN values when inferring the type. .. versionadded:: 0.21.0 - .. versionchanged:: 0.24.0 - Switched default of ``skipna`` to ``True`` Returns ------- @@ -1186,6 +1185,12 @@ def infer_dtype(value: object, skipna: bool=True) -> str: bint seen_pdnat = False bint seen_val = False + if skipna is None: + msg = ('A future version of pandas will default to `skipna=True`. To ' + 'silence this warning, pass `skipna=True|False` explicitly.') + warnings.warn(msg, FutureWarning, stacklevel=2) + skipna = False + if util.is_array(value): values = value elif hasattr(value, 'dtype'): diff --git a/pandas/conftest.py b/pandas/conftest.py index c833e8abaeb9e..20f97bdec1107 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -545,7 +545,7 @@ def any_numpy_dtype(request): # categoricals are handled separately -_any_inferred_dtype = [ +_any_skipna_inferred_dtype = [ ('string', ['a', np.nan, 'c']), ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]), ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']), @@ -570,11 +570,11 @@ def any_numpy_dtype(request): ('time', [time(1), np.nan, time(2)]), ('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]), ('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)])] -ids, _ = zip(*_any_inferred_dtype) # use inferred type as fixture-id +ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id -@pytest.fixture(params=_any_inferred_dtype, ids=ids) -def any_inferred_dtype(request): +@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids) +def any_skipna_inferred_dtype(request): """ Fixture for all inferred dtypes from _libs.lib.infer_dtype @@ -610,10 +610,10 @@ def any_inferred_dtype(request): -------- >>> import pandas._libs.lib as lib >>> - >>> def test_something(any_inferred_dtype): - ... inferred_dtype, values = any_inferred_dtype + >>> def test_something(any_skipna_inferred_dtype): + ... inferred_dtype, values = any_skipna_inferred_dtype ... # will pass - ... assert lib.infer_dtype(values) == inferred_dtype + ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype """ inferred_dtype, values = request.param values = np.array(values, dtype=object) # object dtype to avoid casting diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1c74e4d7a3359..a91bec62503b1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -165,7 +165,7 @@ def _ensure_arraylike(values): ensure that we are arraylike if not already """ if not is_array_like(values): - inferred = lib.infer_dtype(values) + inferred = lib.infer_dtype(values, skipna=True) if inferred in ['mixed', 'string', 'unicode']: if isinstance(values, tuple): values = list(values) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index a92e2f6157b40..d881b197f8414 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1482,7 +1482,7 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, # TODO: We do not have tests specific to string-dtypes, # also complex or categorical or other extension copy = False - if lib.infer_dtype(data) == 'integer': + if lib.infer_dtype(data, skipna=True) == 'integer': data = data.astype(np.int64) else: # data comes back here as either i8 to denote UTC timestamps diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 86a57a2864bde..2084eef84560b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -170,7 +170,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False): values = np.array(values, copy=copy) if is_object_dtype(values): - inferred_type = lib.infer_dtype(values) + inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == 'empty': values = np.empty(len(values)) values.fill(np.nan) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 9b7e1986e4831..623917b437c57 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -492,7 +492,7 @@ def __floordiv__(self, other): elif is_object_dtype(other): result = [self[n] // other[n] for n in range(len(self))] result = np.array(result) - if lib.infer_dtype(result) == 'timedelta': + if lib.infer_dtype(result, skipna=True) == 'timedelta': result, _ = sequence_to_td64ns(result) return type(self)(result) return result diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 57c3f26a67c7e..f3bc67d83f89e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -73,7 +73,8 @@ def trans(x): if isinstance(dtype, string_types): if dtype == 'infer': - inferred_type = lib.infer_dtype(ensure_object(result.ravel())) + inferred_type = lib.infer_dtype(ensure_object(result.ravel()), + skipna=True) if inferred_type == 'boolean': dtype = 'bool' elif inferred_type == 'integer': @@ -458,7 +459,7 @@ def infer_dtype_from_array(arr, pandas_dtype=False): return arr.dtype, np.asarray(arr) # don't force numpy coerce with nan's - inferred = lib.infer_dtype(arr) + inferred = lib.infer_dtype(arr, skipna=True) if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']: return (np.object_, arr) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index e1141c6b6b3a8..3f3d539ee1e1f 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -704,7 +704,8 @@ def is_datetime_arraylike(arr): if isinstance(arr, ABCDatetimeIndex): return True elif isinstance(arr, (np.ndarray, ABCSeries)): - return arr.dtype == object and lib.infer_dtype(arr) == 'datetime' + return (arr.dtype == object + and lib.infer_dtype(arr, skipna=True) == 'datetime') return getattr(arr, 'inferred_type', None) == 'datetime' diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 809dcbd054ea0..e1fd1f95649d0 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -470,7 +470,7 @@ def _infer_fill_value(val): if is_datetimelike(val): return np.array('NaT', dtype=val.dtype) elif is_object_dtype(val.dtype): - dtype = lib.infer_dtype(ensure_object(val)) + dtype = lib.infer_dtype(ensure_object(val), skipna=True) if dtype in ['datetime', 'datetime64']: return np.array('NaT', dtype=_NS_DTYPE) elif dtype in ['timedelta', 'timedelta64']: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 13e851293c7e2..885e8dd955fbf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -341,7 +341,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # should not be coerced # GH 11836 if is_integer_dtype(dtype): - inferred = lib.infer_dtype(data) + inferred = lib.infer_dtype(data, skipna=True) if inferred == 'integer': data = maybe_cast_to_integer_array(data, dtype, copy=copy) @@ -371,7 +371,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, else: data = data.astype(dtype) elif is_float_dtype(dtype): - inferred = lib.infer_dtype(data) + inferred = lib.infer_dtype(data, skipna=True) if inferred == 'string': pass else: @@ -1731,7 +1731,7 @@ def inferred_type(self): """ Return a string of the type inferred from the values. """ - return lib.infer_dtype(self) + return lib.infer_dtype(self, skipna=True) @cache_readonly def is_all_dates(self): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 5e26a3c6c439e..600f25f618a86 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2233,7 +2233,8 @@ def _partial_tup_index(self, tup, side='left'): section = labs[start:end] if lab not in lev: - if not lev.is_type_compatible(lib.infer_dtype([lab])): + if not lev.is_type_compatible(lib.infer_dtype([lab], + skipna=True)): raise TypeError('Level type mismatch: %s' % lab) # short circuit diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c437456794f43..da7138a9204bf 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -658,7 +658,7 @@ def sanitize_array(data, index, dtype=None, copy=False, subarr = np.array(data, dtype=object, copy=copy) if is_object_dtype(subarr.dtype) and dtype != 'object': - inferred = lib.infer_dtype(subarr) + inferred = lib.infer_dtype(subarr, skipna=True) if inferred == 'period': try: subarr = period_array(subarr) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c0c016f9a8caa..c7f0346f9b21c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -942,7 +942,8 @@ def _maybe_coerce_merge_keys(self): 'representation', UserWarning) # let's infer and see if we are ok - elif lib.infer_dtype(lk) == lib.infer_dtype(rk): + elif (lib.infer_dtype(lk, skipna=True) + == lib.infer_dtype(rk, skipna=True)): pass # Check if we are trying to merge on obviously diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 5d5f6cf8102be..adf4f15ca5a76 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -412,7 +412,7 @@ def _convert_bin_to_numeric_type(bins, dtype): ------ ValueError if bins are not of a compat dtype to dtype """ - bins_dtype = infer_dtype(bins) + bins_dtype = infer_dtype(bins, skipna=True) if is_timedelta64_dtype(dtype): if bins_dtype in ['timedelta', 'timedelta64']: bins = to_timedelta(bins).view(np.int64) diff --git a/pandas/core/series.py b/pandas/core/series.py index 6d951a7a5228a..806e6bd58bed7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -869,7 +869,7 @@ def _get_with(self, key): if isinstance(key, Index): key_type = key.inferred_type else: - key_type = lib.infer_dtype(key) + key_type = lib.infer_dtype(key, skipna=True) if key_type == 'integer': if self.index.is_integer() or self.index.is_floating(): @@ -1006,7 +1006,7 @@ def _set_with(self, key, value): if isinstance(key, Index): key_type = key.inferred_type else: - key_type = lib.infer_dtype(key) + key_type = lib.infer_dtype(key, skipna=True) if key_type == 'integer': if self.index.inferred_type == 'integer': diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index b34dfddcc66e1..308f60d826350 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -454,7 +454,7 @@ def sort_mixed(values): return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None - if PY3 and lib.infer_dtype(values) == 'mixed-integer': + if PY3 and lib.infer_dtype(values, skipna=True) == 'mixed-integer': # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index aadca1fcb3bef..6b90b32d2123f 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1299,7 +1299,7 @@ def _validate_usecols_arg(usecols): elif not is_list_like(usecols): raise ValueError(msg) else: - usecols_dtype = lib.infer_dtype(usecols) + usecols_dtype = lib.infer_dtype(usecols, skipna=True) if usecols_dtype not in ('empty', 'integer', 'string', 'unicode'): raise ValueError(msg) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8132c458ce852..1612ee536b238 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1948,7 +1948,7 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, return self.set_atom_complex(block) dtype = block.dtype.name - inferred_type = lib.infer_dtype(block.values) + inferred_type = lib.infer_dtype(block.values, skipna=True) if inferred_type == 'date': raise TypeError( @@ -1994,7 +1994,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, data = block.values # see if we have a valid string type - inferred_type = lib.infer_dtype(data.ravel()) + inferred_type = lib.infer_dtype(data.ravel(), skipna=True) if inferred_type != 'string': # we cannot serialize this data, so report an exception on a column @@ -2002,7 +2002,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, for i, item in enumerate(block_items): col = block.iget(i) - inferred_type = lib.infer_dtype(col.ravel()) + inferred_type = lib.infer_dtype(col.ravel(), skipna=True) if inferred_type != 'string': raise TypeError( "Cannot serialize the column [%s] because\n" @@ -2739,7 +2739,7 @@ def write_array(self, key, value, items=None): # infer the type, warn if we have a non-string type here (for # performance) - inferred_type = lib.infer_dtype(value.ravel()) + inferred_type = lib.infer_dtype(value.ravel(), skipna=True) if empty_array: pass elif inferred_type == 'string': @@ -4506,7 +4506,7 @@ def _convert_index(index, encoding=None, errors='strict', format_type=None): if isinstance(index, MultiIndex): raise TypeError('MultiIndex not supported here!') - inferred_type = lib.infer_dtype(index) + inferred_type = lib.infer_dtype(index, skipna=True) values = np.asarray(index) @@ -4739,7 +4739,7 @@ def __init__(self, table, where=None, start=None, stop=None): # see if we have a passed coordinate like try: - inferred = lib.infer_dtype(where) + inferred = lib.infer_dtype(where, skipna=True) if inferred == 'integer' or inferred == 'boolean': where = np.asarray(where) if where.dtype == np.bool_: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index eeccd51a79d88..a4ab78d180ea4 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -827,7 +827,7 @@ def _sqlalchemy_type(self, col): # Infer type of column, while ignoring missing values. # Needed for inserting typed data containing NULLs, GH 8778. - col_type = lib.infer_dtype(col) + col_type = lib.infer_dtype(col, skipna=True) from sqlalchemy.types import (BigInteger, Integer, Float, Text, Boolean, @@ -1315,7 +1315,7 @@ def _sql_type_name(self, col): # Infer type of column, while ignoring missing values. # Needed for inserting typed data containing NULLs, GH 8778. - col_type = lib.infer_dtype(col) + col_type = lib.infer_dtype(col, skipna=True) if col_type == 'timedelta64': warnings.warn("the 'timedelta' type is not supported, and will be " diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 0f3e129610d60..b99943541c5d9 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -394,7 +394,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): to_datetime(d['year'], format='%Y').astype(np.int64)) d['days'] = days // NS_PER_DAY - elif infer_dtype(dates) == 'datetime': + elif infer_dtype(dates, skipna=True) == 'datetime': if delta: delta = dates.values - stata_epoch f = lambda x: \ @@ -1865,7 +1865,7 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, if force_strl: return '%9s' if dtype.type == np.object_: - inferred_dtype = infer_dtype(column) + inferred_dtype = infer_dtype(column, skipna=True) if not (inferred_dtype in ('string', 'unicode') or len(column) == 0): raise ValueError('Column `{col}` cannot be exported.\n\nOnly ' diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 5ccbffa3656a3..e1ab269c3c6df 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -315,11 +315,11 @@ def test_infer_dtype_bytes(self): # string array of bytes arr = np.array(list('abc'), dtype='S1') - assert lib.infer_dtype(arr) == compare + assert lib.infer_dtype(arr, skipna=True) == compare # object array of bytes arr = arr.astype(object) - assert lib.infer_dtype(arr) == compare + assert lib.infer_dtype(arr, skipna=True) == compare # object array of bytes with missing values assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare @@ -496,58 +496,65 @@ class TestTypeInference(object): class Dummy(): pass - def test_inferred_dtype_fixture(self, any_inferred_dtype): + def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype): # see pandas/conftest.py - inferred_dtype, values = any_inferred_dtype + inferred_dtype, values = any_skipna_inferred_dtype # make sure the inferred dtype of the fixture is as requested - assert inferred_dtype == lib.infer_dtype(values) + assert inferred_dtype == lib.infer_dtype(values, skipna=True) def test_length_zero(self): - result = lib.infer_dtype(np.array([], dtype='i4')) + result = lib.infer_dtype(np.array([], dtype='i4'), skipna=True) assert result == 'integer' - result = lib.infer_dtype([]) + result = lib.infer_dtype([], skipna=True) assert result == 'empty' # GH 18004 arr = np.array([np.array([], dtype=object), np.array([], dtype=object)]) - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'empty' def test_integers(self): arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'integer' arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'mixed-integer' arr = np.array([1, 2, 3, 4, 5], dtype='i4') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'integer' + def test_warn(self): + arr = np.array([1, 2, 3], dtype=object) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = lib.infer_dtype(arr) # default: skipna=None -> warn + assert result == 'integer' + def test_bools(self): arr = np.array([True, False, True, True, True], dtype='O') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'boolean' arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'boolean' arr = np.array([True, False, True, 'foo'], dtype='O') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'mixed' arr = np.array([True, False, True], dtype=bool) - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'boolean' arr = np.array([True, np.nan, False], dtype='O') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'boolean' result = lib.infer_dtype(arr, skipna=False) @@ -555,38 +562,38 @@ def test_bools(self): def test_floats(self): arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'floating' arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], dtype='O') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'mixed-integer' arr = np.array([1, 2, 3, 4, 5], dtype='f4') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'floating' arr = np.array([1, 2, 3, 4, 5], dtype='f8') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'floating' def test_decimals(self): # GH15690 arr = np.array([Decimal(1), Decimal(2), Decimal(3)]) - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'decimal' arr = np.array([1.0, 2.0, Decimal(3)]) - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'mixed' arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)]) - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'decimal' arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O') - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'decimal' def test_string(self): @@ -598,7 +605,7 @@ def test_unicode(self): assert result == 'mixed' arr = [u'a', np.nan, u'c'] - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) expected = 'unicode' if PY2 else 'string' assert result == expected @@ -628,34 +635,34 @@ def test_infer_dtype_datetime(self): arr = np.array([Timestamp('2011-01-01'), Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' arr = np.array([np.datetime64('2011-01-01'), np.datetime64('2011-01-01')], dtype=object) - assert lib.infer_dtype(arr) == 'datetime64' + assert lib.infer_dtype(arr, skipna=True) == 'datetime64' arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]) - assert lib.infer_dtype(arr) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' # starts with nan for n in [pd.NaT, np.nan]: arr = np.array([n, pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' arr = np.array([n, np.datetime64('2011-01-02')]) - assert lib.infer_dtype(arr) == 'datetime64' + assert lib.infer_dtype(arr, skipna=True) == 'datetime64' arr = np.array([n, datetime(2011, 1, 1)]) - assert lib.infer_dtype(arr) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' arr = np.array([n, pd.Timestamp('2011-01-02'), n]) - assert lib.infer_dtype(arr) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' arr = np.array([n, np.datetime64('2011-01-02'), n]) - assert lib.infer_dtype(arr) == 'datetime64' + assert lib.infer_dtype(arr, skipna=True) == 'datetime64' arr = np.array([n, datetime(2011, 1, 1), n]) - assert lib.infer_dtype(arr) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' # different type of nat arr = np.array([np.timedelta64('nat'), @@ -669,58 +676,58 @@ def test_infer_dtype_datetime(self): # mixed datetime arr = np.array([datetime(2011, 1, 1), pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' # should be datetime? arr = np.array([np.datetime64('2011-01-01'), pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr, skipna=True) == 'mixed' arr = np.array([pd.Timestamp('2011-01-02'), np.datetime64('2011-01-01')]) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr, skipna=True) == 'mixed' arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1]) - assert lib.infer_dtype(arr) == 'mixed-integer' + assert lib.infer_dtype(arr, skipna=True) == 'mixed-integer' arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1]) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr, skipna=True) == 'mixed' arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr) == 'mixed' + assert lib.infer_dtype(arr, skipna=True) == 'mixed' def test_infer_dtype_timedelta(self): arr = np.array([pd.Timedelta('1 days'), pd.Timedelta('2 days')]) - assert lib.infer_dtype(arr) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D')], dtype=object) - assert lib.infer_dtype(arr) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([timedelta(1), timedelta(2)]) - assert lib.infer_dtype(arr) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' # starts with nan for n in [pd.NaT, np.nan]: arr = np.array([n, Timedelta('1 days')]) - assert lib.infer_dtype(arr) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([n, np.timedelta64(1, 'D')]) - assert lib.infer_dtype(arr) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([n, timedelta(1)]) - assert lib.infer_dtype(arr) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([n, pd.Timedelta('1 days'), n]) - assert lib.infer_dtype(arr) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([n, np.timedelta64(1, 'D'), n]) - assert lib.infer_dtype(arr) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([n, timedelta(1), n]) - assert lib.infer_dtype(arr) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' # different type of nat arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')], @@ -735,19 +742,19 @@ def test_infer_dtype_period(self): # GH 13664 arr = np.array([pd.Period('2011-01', freq='D'), pd.Period('2011-02', freq='D')]) - assert lib.infer_dtype(arr) == 'period' + assert lib.infer_dtype(arr, skipna=True) == 'period' arr = np.array([pd.Period('2011-01', freq='D'), pd.Period('2011-02', freq='M')]) - assert lib.infer_dtype(arr) == 'period' + assert lib.infer_dtype(arr, skipna=True) == 'period' # starts with nan for n in [pd.NaT, np.nan]: arr = np.array([n, pd.Period('2011-01', freq='D')]) - assert lib.infer_dtype(arr) == 'period' + assert lib.infer_dtype(arr, skipna=True) == 'period' arr = np.array([n, pd.Period('2011-01', freq='D'), n]) - assert lib.infer_dtype(arr) == 'period' + assert lib.infer_dtype(arr, skipna=True) == 'period' # different type of nat arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')], @@ -826,15 +833,15 @@ def test_infer_datetimelike_array_nan_nat_like(self, first, second, def test_infer_dtype_all_nan_nat_like(self): arr = np.array([np.nan, np.nan]) - assert lib.infer_dtype(arr) == 'floating' + assert lib.infer_dtype(arr, skipna=True) == 'floating' # nan and None mix are result in mixed arr = np.array([np.nan, np.nan, None]) - assert lib.infer_dtype(arr) == 'empty' + assert lib.infer_dtype(arr, skipna=True) == 'empty' assert lib.infer_dtype(arr, skipna=False) == 'mixed' arr = np.array([None, np.nan, np.nan]) - assert lib.infer_dtype(arr) == 'empty' + assert lib.infer_dtype(arr, skipna=True) == 'empty' assert lib.infer_dtype(arr, skipna=False) == 'mixed' # pd.NaT @@ -948,7 +955,7 @@ def test_date(self): result = lib.infer_dtype(dates, skipna=False) assert result == 'mixed' - result = lib.infer_dtype(dates) + result = lib.infer_dtype(dates, skipna=True) assert result == 'date' def test_is_numeric_array(self): @@ -991,7 +998,7 @@ def test_object(self): arr = np.array([None], dtype='O') result = lib.infer_dtype(arr, skipna=False) assert result == 'mixed' - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'empty' def test_to_object_array_width(self): @@ -1023,17 +1030,17 @@ def test_categorical(self): # GH 8974 from pandas import Categorical, Series arr = Categorical(list('abc')) - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'categorical' - result = lib.infer_dtype(Series(arr)) + result = lib.infer_dtype(Series(arr), skipna=True) assert result == 'categorical' arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) - result = lib.infer_dtype(arr) + result = lib.infer_dtype(arr, skipna=True) assert result == 'categorical' - result = lib.infer_dtype(Series(arr)) + result = lib.infer_dtype(Series(arr), skipna=True) assert result == 'categorical' diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index e93bfe3e763a4..94453b336bf9e 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -119,7 +119,7 @@ def any_string_method(request): # subset of the full set from pandas/conftest.py -_any_allowed_inferred_dtype = [ +_any_allowed_skipna_inferred_dtype = [ ('string', ['a', np.nan, 'c']), ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]), ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']), @@ -127,11 +127,11 @@ def any_string_method(request): ('empty', []), ('mixed-integer', ['a', np.nan, 2]) ] -ids, _ = zip(*_any_allowed_inferred_dtype) # use inferred type as id +ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id -@pytest.fixture(params=_any_allowed_inferred_dtype, ids=ids) -def any_allowed_inferred_dtype(request): +@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids) +def any_allowed_skipna_inferred_dtype(request): """ Fixture for all (inferred) dtypes allowed in StringMethods.__init__ @@ -155,10 +155,10 @@ def any_allowed_inferred_dtype(request): -------- >>> import pandas._libs.lib as lib >>> - >>> def test_something(any_allowed_inferred_dtype): - ... inferred_dtype, values = any_allowed_inferred_dtype + >>> def test_something(any_allowed_skipna_inferred_dtype): + ... inferred_dtype, values = any_allowed_skipna_inferred_dtype ... # will pass - ... assert lib.infer_dtype(values) == inferred_dtype + ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype """ inferred_dtype, values = request.param values = np.array(values, dtype=object) # object dtype to avoid casting @@ -177,9 +177,9 @@ def test_api(self): @pytest.mark.parametrize('dtype', [object, 'category']) @pytest.mark.parametrize('box', [Series, Index]) - def test_api_per_dtype(self, box, dtype, any_inferred_dtype): + def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): # one instance of parametrized fixture - inferred_dtype, values = any_inferred_dtype + inferred_dtype, values = any_skipna_inferred_dtype t = box(values, dtype=dtype) # explicit dtype to avoid casting @@ -217,14 +217,15 @@ def test_api_per_dtype(self, box, dtype, any_inferred_dtype): @pytest.mark.parametrize('dtype', [object, 'category']) @pytest.mark.parametrize('box', [Series, Index]) - def test_api_per_method(self, box, dtype, any_allowed_inferred_dtype, + def test_api_per_method(self, box, dtype, + any_allowed_skipna_inferred_dtype, any_string_method): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, # and raise on all others # one instance of each parametrized fixture - inferred_dtype, values = any_allowed_inferred_dtype + inferred_dtype, values = any_allowed_skipna_inferred_dtype method_name, args, kwargs = any_string_method # TODO: get rid of these xfails From 568bfd9fcc1b0f01da62b73248de84dc318f579c Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 6 Dec 2018 07:46:56 +0100 Subject: [PATCH 03/12] Missed one --- pandas/tests/series/test_constructors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index f5a445e2cca9a..e1732d106b665 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -806,12 +806,12 @@ def test_constructor_with_datetime_tz(self): s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')]) assert s.dtype == 'datetime64[ns, US/Pacific]' - assert lib.infer_dtype(s) == 'datetime64' + assert lib.infer_dtype(s, skipna=True) == 'datetime64' s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')]) assert s.dtype == 'object' - assert lib.infer_dtype(s) == 'datetime' + assert lib.infer_dtype(s, skipna=True) == 'datetime' # with all NaT s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') From 8c3c5b43ba18e1ec4cab89e67b4a334ff8a5537a Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 15 Dec 2018 00:50:14 +0100 Subject: [PATCH 04/12] Review (jreback) --- pandas/_libs/lib.pyx | 2 +- pandas/core/algorithms.py | 8 ++++++-- pandas/core/dtypes/cast.py | 5 +++-- pandas/core/dtypes/common.py | 2 +- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 155cb4f13b0f7..2e7acdc15e0bb 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1087,7 +1087,7 @@ def infer_dtype(value: object, skipna: object=None) -> str: Parameters ---------- value : scalar, list, ndarray, or pandas type - skipna : bool, default None + skipna : bool, default False Ignore NaN values when inferring the type. .. versionadded:: 0.21.0 diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2fc7c7254ed65..88d81115e609b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -202,7 +202,9 @@ def _get_hashtable_algo(values): if ndtype == 'object': - # its cheaper to use a String Hash Table than Object + # it's cheaper to use a String Hash Table than Object; we infer + # including nulls because that is the only difference between + # StringHashTable and ObjectHashtable if lib.infer_dtype(values, skipna=False) in ['string']: ndtype = 'string' else: @@ -220,7 +222,9 @@ def _get_data_algo(values, func_map): values, dtype, ndtype = _ensure_data(values) if ndtype == 'object': - # its cheaper to use a String Hash Table than Object + # it's cheaper to use a String Hash Table than Object; we infer + # including nulls because that is the only difference between + # StringHashTable and ObjectHashtable if lib.infer_dtype(values, skipna=False) in ['string']: ndtype = 'string' diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f3bc67d83f89e..2715105dae02e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -938,10 +938,11 @@ def try_timedelta(v): # We have at least a NaT and a string # try timedelta first to avoid spurious datetime conversions - # e.g. '00:00:01' is a timedelta but - # technically is also a datetime + # e.g. '00:00:01' is a timedelta but technically is also a datetime value = try_timedelta(v) if lib.infer_dtype(value, skipna=False) in ['mixed']: + # cannot skip missing values, as NaT implies that the string + # is actually a datetime value = try_datetime(v) return value diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 3f3d539ee1e1f..12d39758b630b 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -704,7 +704,7 @@ def is_datetime_arraylike(arr): if isinstance(arr, ABCDatetimeIndex): return True elif isinstance(arr, (np.ndarray, ABCSeries)): - return (arr.dtype == object + return (is_object_dtype(arr.dtype) and lib.infer_dtype(arr, skipna=True) == 'datetime') return getattr(arr, 'inferred_type', None) == 'datetime' From 52616b0566c275be20ffe3dd2d6d0985119d62da Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 19 Dec 2018 23:51:13 +0100 Subject: [PATCH 05/12] Fix new occurrence of infer_dtype --- pandas/plotting/_converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py index 3a3ebe7c56f67..9aa79518a2c22 100644 --- a/pandas/plotting/_converter.py +++ b/pandas/plotting/_converter.py @@ -246,7 +246,7 @@ def _convert_1d(values, units, axis): return values.asfreq(axis.freq)._ndarray_values elif isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) - elif lib.infer_dtype(values) == 'period': + elif lib.infer_dtype(values, skipna=False) == 'period': # https://github.com/pandas-dev/pandas/issues/24304 # convert ndarray[period] -> PeriodIndex return PeriodIndex(values, freq=axis.freq)._ndarray_values From 245b7bc2d4615c0b5e5684d32ded4f09a1f79e0b Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 21 Dec 2018 11:46:34 +0100 Subject: [PATCH 06/12] Fix overlooked merge artefact --- doc/source/whatsnew/v0.24.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 4a6cda011bbce..5f52485e2aded 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1145,7 +1145,7 @@ Deprecations - :func:`pandas.api.types.is_period` is deprecated in favor of `pandas.api.types.is_period_dtype` (:issue:`23917`) - :func:`pandas.api.types.is_datetimetz` is deprecated in favor of `pandas.api.types.is_datetime64tz` (:issue:`23917`) - Creating a :class:`TimedeltaIndex` or :class:`DatetimeIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range` and :func:`date_range` (:issue:`23919`) -- Passing a string alias like ``'datetime64[ns, UTC]'`` as the `unit` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`). +- Passing a string alias like ``'datetime64[ns, UTC]'`` as the ``unit`` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`). - The ``skipna`` parameter of :meth:`~pandas.api.types.infer_dtype` will switch to ``True`` by default in a future version of pandas (:issue:`17066`, :issue:`24050`) - In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`). - :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`) From 72ba292d631af679ec25dad9b491010b6a77ff4a Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 3 Jan 2019 10:55:37 +0100 Subject: [PATCH 07/12] Oversight --- pandas/io/stata.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b423835871765..e1ebc427ac46f 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -396,11 +396,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): to_datetime(d['year'], format='%Y').astype(np.int64)) d['days'] = days // NS_PER_DAY -<<<<<<< HEAD elif infer_dtype(dates, skipna=True) == 'datetime': -======= - elif infer_dtype(dates, skipna=False) == 'datetime': ->>>>>>> upstream/master if delta: delta = dates.values - stata_epoch f = lambda x: \ From ebd106382b3ad4e8016a446ca2e59f989d2f23fd Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 3 Jan 2019 11:27:39 +0100 Subject: [PATCH 08/12] Add kwarg to new call-sites --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c52728457277b..5b22ebb7b469d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -962,8 +962,8 @@ def _maybe_coerce_merge_keys(self): # object values are allowed to be merged elif ((lk_is_object and is_numeric_dtype(rk)) or (is_numeric_dtype(lk) and rk_is_object)): - inferred_left = lib.infer_dtype(lk) - inferred_right = lib.infer_dtype(rk) + inferred_left = lib.infer_dtype(lk, skipna=False) + inferred_right = lib.infer_dtype(rk, skipna=False) bool_types = ['integer', 'mixed-integer', 'boolean', 'empty'] string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty'] From e287bd84d8340a8e924e73a942823be9f0d2e262 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 3 Jan 2019 11:56:08 +0100 Subject: [PATCH 09/12] and another new call-site --- pandas/core/arrays/array_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 4e84c62bce3d6..d17e508e3ca90 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -209,7 +209,7 @@ def array(data, # type: Sequence[object] return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: - inferred_dtype = lib.infer_dtype(data) + inferred_dtype = lib.infer_dtype(data, skipna=False) if inferred_dtype == 'period': try: return period_array(data, copy=copy) From 1ea21fef39c46ecdc628e1be0a6150f05d1c9693 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 3 Jan 2019 12:00:33 +0100 Subject: [PATCH 10/12] Try skipna=True for new call-sites --- pandas/core/arrays/array_.py | 2 +- pandas/core/reshape/merge.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index d17e508e3ca90..d037c7d69f964 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -209,7 +209,7 @@ def array(data, # type: Sequence[object] return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: - inferred_dtype = lib.infer_dtype(data, skipna=False) + inferred_dtype = lib.infer_dtype(data, skipna=True) if inferred_dtype == 'period': try: return period_array(data, copy=copy) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5b22ebb7b469d..b4f650eb7b971 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -962,8 +962,8 @@ def _maybe_coerce_merge_keys(self): # object values are allowed to be merged elif ((lk_is_object and is_numeric_dtype(rk)) or (is_numeric_dtype(lk) and rk_is_object)): - inferred_left = lib.infer_dtype(lk, skipna=False) - inferred_right = lib.infer_dtype(rk, skipna=False) + inferred_left = lib.infer_dtype(lk, skipna=True) + inferred_right = lib.infer_dtype(rk, skipna=True) bool_types = ['integer', 'mixed-integer', 'boolean', 'empty'] string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty'] From fd8c0101f5ec07a302326f7c9bc2b29ddf5644cf Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 3 Jan 2019 12:41:15 +0100 Subject: [PATCH 11/12] Revert skipna=True where it was added --- pandas/core/algorithms.py | 2 +- pandas/core/arrays/array_.py | 2 +- pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/timedeltas.py | 2 +- pandas/core/dtypes/cast.py | 4 ++-- pandas/core/dtypes/common.py | 2 +- pandas/core/dtypes/missing.py | 2 +- pandas/core/indexes/base.py | 6 +++--- pandas/core/indexes/multi.py | 2 +- pandas/core/internals/construction.py | 2 +- pandas/core/reshape/merge.py | 8 ++++---- pandas/core/reshape/tile.py | 2 +- pandas/core/series.py | 4 ++-- pandas/core/sorting.py | 2 +- pandas/io/parsers.py | 2 +- pandas/io/pytables.py | 12 ++++++------ pandas/io/stata.py | 2 +- 17 files changed, 29 insertions(+), 29 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index dfa52eebe6dc0..b473a7aef929e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -165,7 +165,7 @@ def _ensure_arraylike(values): ensure that we are arraylike if not already """ if not is_array_like(values): - inferred = lib.infer_dtype(values, skipna=True) + inferred = lib.infer_dtype(values, skipna=False) if inferred in ['mixed', 'string', 'unicode']: if isinstance(values, tuple): values = list(values) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index d037c7d69f964..d17e508e3ca90 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -209,7 +209,7 @@ def array(data, # type: Sequence[object] return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: - inferred_dtype = lib.infer_dtype(data, skipna=True) + inferred_dtype = lib.infer_dtype(data, skipna=False) if inferred_dtype == 'period': try: return period_array(data, copy=copy) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index d40bf87d4d3db..c428fd2e75e08 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1678,7 +1678,7 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, # TODO: We do not have tests specific to string-dtypes, # also complex or categorical or other extension copy = False - if lib.infer_dtype(data, skipna=True) == 'integer': + if lib.infer_dtype(data, skipna=False) == 'integer': data = data.astype(np.int64) else: # data comes back here as either i8 to denote UTC timestamps diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index fcb78f54010ed..3677d041886b3 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -580,7 +580,7 @@ def __floordiv__(self, other): elif is_object_dtype(other): result = [self[n] // other[n] for n in range(len(self))] result = np.array(result) - if lib.infer_dtype(result, skipna=True) == 'timedelta': + if lib.infer_dtype(result, skipna=False) == 'timedelta': result, _ = sequence_to_td64ns(result) return type(self)(result) return result diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index ff56763fd1149..b55bad46580fe 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -76,7 +76,7 @@ def trans(x): if isinstance(dtype, string_types): if dtype == 'infer': inferred_type = lib.infer_dtype(ensure_object(result.ravel()), - skipna=True) + skipna=False) if inferred_type == 'boolean': dtype = 'bool' elif inferred_type == 'integer': @@ -461,7 +461,7 @@ def infer_dtype_from_array(arr, pandas_dtype=False): return arr.dtype, np.asarray(arr) # don't force numpy coerce with nan's - inferred = lib.infer_dtype(arr, skipna=True) + inferred = lib.infer_dtype(arr, skipna=False) if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']: return (np.object_, arr) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 8ac7c49637009..b4c769fab88ad 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -704,7 +704,7 @@ def is_datetime_arraylike(arr): return True elif isinstance(arr, (np.ndarray, ABCSeries)): return (is_object_dtype(arr.dtype) - and lib.infer_dtype(arr, skipna=True) == 'datetime') + and lib.infer_dtype(arr, skipna=False) == 'datetime') return getattr(arr, 'inferred_type', None) == 'datetime' diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index b371c91ef4cfd..b22cb1050f140 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -474,7 +474,7 @@ def _infer_fill_value(val): if is_datetimelike(val): return np.array('NaT', dtype=val.dtype) elif is_object_dtype(val.dtype): - dtype = lib.infer_dtype(ensure_object(val), skipna=True) + dtype = lib.infer_dtype(ensure_object(val), skipna=False) if dtype in ['datetime', 'datetime64']: return np.array('NaT', dtype=_NS_DTYPE) elif dtype in ['timedelta', 'timedelta64']: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bbc97b9824a8b..a7f2d4fad38de 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -346,7 +346,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # should not be coerced # GH 11836 if is_integer_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=True) + inferred = lib.infer_dtype(data, skipna=False) if inferred == 'integer': data = maybe_cast_to_integer_array(data, dtype, copy=copy) @@ -376,7 +376,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, else: data = data.astype(dtype) elif is_float_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=True) + inferred = lib.infer_dtype(data, skipna=False) if inferred == 'string': pass else: @@ -1718,7 +1718,7 @@ def inferred_type(self): """ Return a string of the type inferred from the values. """ - return lib.infer_dtype(self, skipna=True) + return lib.infer_dtype(self, skipna=False) @cache_readonly def is_all_dates(self): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 69281f5eedc5b..8d26080a0361d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2319,7 +2319,7 @@ def _partial_tup_index(self, tup, side='left'): if lab not in lev: if not lev.is_type_compatible(lib.infer_dtype([lab], - skipna=True)): + skipna=False)): raise TypeError('Level type mismatch: %s' % lab) # short circuit diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index dfcfff1475271..62e7f64518bcc 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -667,7 +667,7 @@ def sanitize_array(data, index, dtype=None, copy=False, subarr = np.array(data, dtype=object, copy=copy) if is_object_dtype(subarr.dtype) and dtype != 'object': - inferred = lib.infer_dtype(subarr, skipna=True) + inferred = lib.infer_dtype(subarr, skipna=False) if inferred == 'period': try: subarr = period_array(subarr) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index b4f650eb7b971..7861a122afdb6 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -947,8 +947,8 @@ def _maybe_coerce_merge_keys(self): continue # let's infer and see if we are ok - elif (lib.infer_dtype(lk, skipna=True) - == lib.infer_dtype(rk, skipna=True)): + elif (lib.infer_dtype(lk, skipna=False) + == lib.infer_dtype(rk, skipna=False)): continue # Check if we are trying to merge on obviously @@ -962,8 +962,8 @@ def _maybe_coerce_merge_keys(self): # object values are allowed to be merged elif ((lk_is_object and is_numeric_dtype(rk)) or (is_numeric_dtype(lk) and rk_is_object)): - inferred_left = lib.infer_dtype(lk, skipna=True) - inferred_right = lib.infer_dtype(rk, skipna=True) + inferred_left = lib.infer_dtype(lk, skipna=False) + inferred_right = lib.infer_dtype(rk, skipna=False) bool_types = ['integer', 'mixed-integer', 'boolean', 'empty'] string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty'] diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index bb692fb9a3c81..6f95b14993228 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -416,7 +416,7 @@ def _convert_bin_to_numeric_type(bins, dtype): ------ ValueError if bins are not of a compat dtype to dtype """ - bins_dtype = infer_dtype(bins, skipna=True) + bins_dtype = infer_dtype(bins, skipna=False) if is_timedelta64_dtype(dtype): if bins_dtype in ['timedelta', 'timedelta64']: bins = to_timedelta(bins).view(np.int64) diff --git a/pandas/core/series.py b/pandas/core/series.py index ed585240b9741..52b60339a7d68 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -875,7 +875,7 @@ def _get_with(self, key): if isinstance(key, Index): key_type = key.inferred_type else: - key_type = lib.infer_dtype(key, skipna=True) + key_type = lib.infer_dtype(key, skipna=False) if key_type == 'integer': if self.index.is_integer() or self.index.is_floating(): @@ -1012,7 +1012,7 @@ def _set_with(self, key, value): if isinstance(key, Index): key_type = key.inferred_type else: - key_type = lib.infer_dtype(key, skipna=True) + key_type = lib.infer_dtype(key, skipna=False) if key_type == 'integer': if self.index.inferred_type == 'integer': diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 308f60d826350..ef69939d6e978 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -454,7 +454,7 @@ def sort_mixed(values): return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None - if PY3 and lib.infer_dtype(values, skipna=True) == 'mixed-integer': + if PY3 and lib.infer_dtype(values, skipna=False) == 'mixed-integer': # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c0b30c852af2d..5590e8f445c67 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1300,7 +1300,7 @@ def _validate_usecols_arg(usecols): elif not is_list_like(usecols): raise ValueError(msg) else: - usecols_dtype = lib.infer_dtype(usecols, skipna=True) + usecols_dtype = lib.infer_dtype(usecols, skipna=False) if usecols_dtype not in ('empty', 'integer', 'string', 'unicode'): raise ValueError(msg) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 516e3eb2d1430..b115529f696b8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1952,7 +1952,7 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, return self.set_atom_complex(block) dtype = block.dtype.name - inferred_type = lib.infer_dtype(block.values, skipna=True) + inferred_type = lib.infer_dtype(block.values, skipna=False) if inferred_type == 'date': raise TypeError( @@ -1998,7 +1998,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, data = block.values # see if we have a valid string type - inferred_type = lib.infer_dtype(data.ravel(), skipna=True) + inferred_type = lib.infer_dtype(data.ravel(), skipna=False) if inferred_type != 'string': # we cannot serialize this data, so report an exception on a column @@ -2006,7 +2006,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, for i, item in enumerate(block_items): col = block.iget(i) - inferred_type = lib.infer_dtype(col.ravel(), skipna=True) + inferred_type = lib.infer_dtype(col.ravel(), skipna=False) if inferred_type != 'string': raise TypeError( "Cannot serialize the column [%s] because\n" @@ -2745,7 +2745,7 @@ def write_array(self, key, value, items=None): # infer the type, warn if we have a non-string type here (for # performance) - inferred_type = lib.infer_dtype(value.ravel(), skipna=True) + inferred_type = lib.infer_dtype(value.ravel(), skipna=False) if empty_array: pass elif inferred_type == 'string': @@ -4512,7 +4512,7 @@ def _convert_index(index, encoding=None, errors='strict', format_type=None): if isinstance(index, MultiIndex): raise TypeError('MultiIndex not supported here!') - inferred_type = lib.infer_dtype(index, skipna=True) + inferred_type = lib.infer_dtype(index, skipna=False) values = np.asarray(index) @@ -4745,7 +4745,7 @@ def __init__(self, table, where=None, start=None, stop=None): # see if we have a passed coordinate like try: - inferred = lib.infer_dtype(where, skipna=True) + inferred = lib.infer_dtype(where, skipna=False) if inferred == 'integer' or inferred == 'boolean': where = np.asarray(where) if where.dtype == np.bool_: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index e1ebc427ac46f..aad57fc489fb6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -396,7 +396,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): to_datetime(d['year'], format='%Y').astype(np.int64)) d['days'] = days // NS_PER_DAY - elif infer_dtype(dates, skipna=True) == 'datetime': + elif infer_dtype(dates, skipna=False) == 'datetime': if delta: delta = dates.values - stata_epoch f = lambda x: \ From 1edd0e91b249509eeb6cf6219f0a8d0563e3e1dd Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 4 Jan 2019 12:13:14 +0100 Subject: [PATCH 12/12] Review (jreback) --- pandas/tests/dtypes/test_inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index d8cbb7a895e13..f58cb362cd6d2 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -549,7 +549,8 @@ def test_integers(self): result = lib.infer_dtype(arr, skipna=True) assert result == 'integer' - def test_warn(self): + def test_deprecation(self): + # GH 24050 arr = np.array([1, 2, 3], dtype=object) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):