From cb8f6c6da9d390ef928390da621b716e2cbf2b1f Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sun, 29 Mar 2020 11:56:10 -0700
Subject: [PATCH 01/29] REF: reshape.concat operate on arrays, not
 SingleBlockManagers

---
 pandas/core/arrays/numpy_.py         |  5 +++++
 pandas/core/internals/managers.py    | 25 -------------------------
 pandas/core/reshape/concat.py        | 12 +++++++-----
 pandas/tests/extension/test_numpy.py | 17 ++++++++++++-----
 4 files changed, 24 insertions(+), 35 deletions(-)

diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 3058e1d6073f3..e46125a87bf8c 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -436,6 +436,11 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True):
     # ------------------------------------------------------------------------
     # Additional Methods
 
+    def astype(self, dtype, copy=True):
+        if dtype is self.dtype:
+            return self.copy() if copy else self
+        return super().astype(dtype, copy=copy)
+
     def to_numpy(
         self, dtype=None, copy: bool = False, na_value=lib.no_default
     ) -> np.ndarray:
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 9630abf61f692..5e05398f0e917 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1633,31 +1633,6 @@ def fast_xs(self, loc):
         """
         raise NotImplementedError("Use series._values[loc] instead")
 
-    def concat(self, to_concat, new_axis: Index) -> "SingleBlockManager":
-        """
-        Concatenate a list of SingleBlockManagers into a single
-        SingleBlockManager.
-
-        Used for pd.concat of Series objects with axis=0.
-
-        Parameters
-        ----------
-        to_concat : list of SingleBlockManagers
-        new_axis : Index of the result
-
-        Returns
-        -------
-        SingleBlockManager
-        """
-
-        blocks = [obj.blocks[0] for obj in to_concat]
-        values = concat_compat([x.values for x in blocks])
-
-        new_block = make_block(values, placement=slice(0, len(values), 1))
-
-        mgr = SingleBlockManager(new_block, new_axis)
-        return mgr
-
     # --------------------------------------------------------------------
     # Constructor Helpers
 
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index b4497ce1780e6..59c39128dcd14 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -9,6 +9,7 @@
 
 from pandas._typing import FrameOrSeriesUnion, Label
 
+from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
 
 from pandas import DataFrame, Index, MultiIndex, Series
@@ -456,12 +457,13 @@ def get_result(self):
         # stack blocks
         if self.axis == 0:
             name = com.consensus_name_attr(self.objs)
-
-            mgr = self.objs[0]._data.concat(
-                [x._data for x in self.objs], self.new_axes
-            )
             cons = self.objs[0]._constructor
-            return cons(mgr, name=name).__finalize__(self, method="concat")
+
+            arrs = [ser._values for ser in self.objs]
+
+            res = concat_compat(arrs, axis=0)
+            result = cons(res, index=self.new_axes[0], name=name, dtype=res.dtype)
+            return result.__finalize__(self, method="concat")
 
         # combine as columns in a frame
         else:
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
index 61c5925383f88..4649b7619e1c6 100644
--- a/pandas/tests/extension/test_numpy.py
+++ b/pandas/tests/extension/test_numpy.py
@@ -130,7 +130,18 @@ def skip_numpy_object(dtype):
 
 
 class BaseNumPyTests:
-    pass
+    @classmethod
+    def assert_series_equal(cls, left, right, *args, **kwargs):
+        # FIXME: kludge because we are patching is_extension_array_dtype
+        # with monkeypatch, needed for test_loc_iloc_frame_single_dtype
+        # in the object-dtype case
+        ld = left.dtype
+        rd = right.dtype
+        if isinstance(ld, PandasDtype) and ld.numpy_dtype == object:
+            if isinstance(rd, np.dtype) and rd == object:
+                # Call these close enough
+                left = left.astype(rd)
+        tm.assert_series_equal(left, right, *args, **kwargs)
 
 
 class TestCasting(BaseNumPyTests, base.BaseCastingTests):
@@ -170,10 +181,6 @@ def test_take_series(self, data):
         # ValueError: PandasArray must be 1-dimensional.
         super().test_take_series(data)
 
-    @pytest.mark.xfail(reason="astype doesn't recognize data.dtype")
-    def test_loc_iloc_frame_single_dtype(self, data):
-        super().test_loc_iloc_frame_single_dtype(data)
-
 
 class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests):
     @skip_nested
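[Illustration] A minimal sketch of the code path this refactor changes, assuming a build with the patch applied (the session itself is hypothetical, not part of the patch):

    import pandas as pd

    ser1 = pd.Series([1, 2], name="x")
    ser2 = pd.Series([3.0, 4.0], name="x")
    # axis=0 Series concat now extracts ._values and lets concat_compat
    # choose the result dtype instead of routing through
    # SingleBlockManager.concat; numpy promotion gives float64 here.
    res = pd.concat([ser1, ser2], ignore_index=True)
    assert res.dtype == "float64"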
From e008f4008850d60668737de64173d5cfc9ef765b Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 30 Mar 2020 18:42:10 -0700
Subject: [PATCH 02/29] xfail more selectively

---
 pandas/tests/extension/test_numpy.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
index 4649b7619e1c6..12f7c60349ad2 100644
--- a/pandas/tests/extension/test_numpy.py
+++ b/pandas/tests/extension/test_numpy.py
@@ -130,18 +130,7 @@ def skip_numpy_object(dtype):
 
 
 class BaseNumPyTests:
-    @classmethod
-    def assert_series_equal(cls, left, right, *args, **kwargs):
-        # FIXME: kludge because we are patching is_extension_array_dtype
-        # with monkeypatch, needed for test_loc_iloc_frame_single_dtype
-        # in the object-dtype case
-        ld = left.dtype
-        rd = right.dtype
-        if isinstance(ld, PandasDtype) and ld.numpy_dtype == object:
-            if isinstance(rd, np.dtype) and rd == object:
-                # Call these close enough
-                left = left.astype(rd)
-        tm.assert_series_equal(left, right, *args, **kwargs)
+    pass
 
 
 class TestCasting(BaseNumPyTests, base.BaseCastingTests):
@@ -170,10 +181,6 @@ def test_take_series(self, data):
         # ValueError: PandasArray must be 1-dimensional.
         super().test_take_series(data)
 
+    def test_loc_iloc_frame_single_dtype(self, data):
+        if data.dtype.numpy_dtype == object:
+            # GH#33125
+            pytest.xfail(reason="astype doesn't recognize data.dtype")
+        super().test_loc_iloc_frame_single_dtype(data)
+
 
 class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests):
     @skip_nested

From 3f0ee1b3dc1afb45bf0bd8682871c6cf5d9ac161 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 6 Apr 2020 13:41:32 -0700
Subject: [PATCH 03/29] Revert PandasArray.astype patch

---
 pandas/core/arrays/numpy_.py         |  5 -----
 pandas/tests/extension/test_numpy.py | 25 ++++++++++++++++++++++---
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index e46125a87bf8c..3058e1d6073f3 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -436,11 +436,6 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True):
     # ------------------------------------------------------------------------
     # Additional Methods
 
-    def astype(self, dtype, copy=True):
-        if dtype is self.dtype:
-            return self.copy() if copy else self
-        return super().astype(dtype, copy=copy)
-
     def to_numpy(
         self, dtype=None, copy: bool = False, na_value=lib.no_default
     ) -> np.ndarray:
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
index 12f7c60349ad2..aa5a99282131a 100644
--- a/pandas/tests/extension/test_numpy.py
+++ b/pandas/tests/extension/test_numpy.py
@@ -171,9 +171,10 @@ def test_take_series(self, data):
         super().test_take_series(data)
 
     def test_loc_iloc_frame_single_dtype(self, data):
-        if data.dtype.numpy_dtype == object:
+        npdtype = data.dtype.numpy_dtype
+        if npdtype == object or npdtype == np.float64:
             # GH#33125
-            pytest.xfail(reason="astype doesn't recognize data.dtype")
+            pytest.xfail(reason="GH#33125 astype doesn't recognize data.dtype")
         super().test_loc_iloc_frame_single_dtype(data)
 
 
 class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests):
     @skip_nested
     def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
         # ValueError: Names should be list-like for a MultiIndex
+        if data_for_grouping.dtype.numpy_dtype == np.float64:
+            pytest.xfail(reason="GH#33125 astype doesn't recognize data.dtype")
         super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
 
 
@@ -278,7 +281,11 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators):
 
 
 class TestPrinting(BaseNumPyTests, base.BasePrintingTests):
-    pass
+    @pytest.mark.xfail(
+        reason="GH#33125 PandasArray.astype does not recognize PandasDtype"
+    )
+    def test_series_repr(self, data):
+        super().test_series_repr(data)
 
 
 @skip_nested
@@ -323,6 +330,18 @@ class TestReshaping(BaseNumPyTests, base.BaseReshapingTests):
     def test_concat_mixed_dtypes(self, data):
         super().test_concat_mixed_dtypes(data)
 
+    @pytest.mark.xfail(
+        reason="GH#33125 PandasArray.astype does not recognize PandasDtype"
+    )
+    def test_concat(self, data, in_frame):
+        super().test_concat(data, in_frame)
+
+    @pytest.mark.xfail(
+        reason="GH#33125 PandasArray.astype does not recognize PandasDtype"
+    )
+    def test_concat_all_na_block(self, data_missing, in_frame):
+        super().test_concat_all_na_block(data_missing, in_frame)
+
     @skip_nested
     def test_merge(self, data, na_value):
         # Fails creating expected
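[Illustration] The xfails above share one root cause; a rough sketch of the kind of call that misbehaves per GH#33125 (hypothetical session; assumes the public PandasArray export):

    import numpy as np
    from pandas.arrays import PandasArray

    arr = PandasArray(np.array([1.0, 2.0]))
    # astype to the array's own PandasDtype should be a cheap copy, but
    # PandasArray.astype does not recognize PandasDtype, which is what the
    # reverted special-case above worked around.
    arr.astype(arr.dtype)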
From 2da47dec41a5f8fcb353bcd2302db32b9cb11b82 Mon Sep 17 00:00:00 2001
From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com>
Date: Mon, 6 Apr 2020 23:42:00 +0300
Subject: [PATCH 04/29] DOC: Fix examples in `pandas/core/strings.py` (#33328)

---
 ci/code_checks.sh      |  4 ++++
 pandas/core/strings.py | 51 ++++++++++++++++++++++++++----------------
 2 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index cd9e4384fd0d9..1bdbbb54a0aac 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -296,6 +296,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
     pytest -q --doctest-modules pandas/core/series.py
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
+    MSG='Doctests strings.py' ; echo $MSG
+    pytest -q --doctest-modules pandas/core/strings.py
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
     # Directories
 
     MSG='Doctests arrays'; echo $MSG
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 59b8b37f72695..52d9a81489db4 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -652,9 +652,9 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
     To get the idea:
 
     >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)
-    0    <_sre.SRE_Match object; span=(0, 1), match='f'>oo
-    1    <_sre.SRE_Match object; span=(0, 1), match='f'>uz
-    2                                                  NaN
+    0    <re.Match object; span=(0, 1), match='f'>oo
+    1    <re.Match object; span=(0, 1), match='f'>uz
+    2                                             NaN
     dtype: object
 
     Reverse every lowercase alphabetic word:
@@ -2076,8 +2076,18 @@ class StringMethods(NoNewAttributesMixin):
 
     Examples
     --------
-    >>> s.str.split('_')
-    >>> s.str.replace('_', '')
+    >>> s = pd.Series(["A_Str_Series"])
+    >>> s
+    0    A_Str_Series
+    dtype: object
+
+    >>> s.str.split("_")
+    0    [A, Str, Series]
+    dtype: object
+
+    >>> s.str.replace("_", "")
+    0    AStrSeries
+    dtype: object
    """

    def __init__(self, data):
@@ -2583,9 +2593,14 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
 
         Examples
         --------
-        >>> s = pd.Series(["this is a regular sentence",
-        ...                "https://docs.python.org/3/tutorial/index.html",
-        ...                np.nan])
+        >>> s = pd.Series(
+        ...     [
+        ...         "this is a regular sentence",
+        ...         "https://docs.python.org/3/tutorial/index.html",
+        ...         np.nan
+        ...     ]
+        ... )
+        >>> s
         0                       this is a regular sentence
         1    https://docs.python.org/3/tutorial/index.html
         2                                              NaN
         dtype: object
@@ -2625,7 +2640,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
 
         The `pat` parameter can be used to split by other characters.
 
-        >>> s.str.split(pat = "/")
+        >>> s.str.split(pat="/")
         0                         [this is a regular sentence]
         1    [https:, , docs.python.org, 3, tutorial, index...
         2                                                  NaN
         dtype: object
@@ -2636,14 +2651,10 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
         the columns during the split.
 
         >>> s.str.split(expand=True)
-                                                       0     1     2        3
-        0                                           this    is     a  regular
-        1  https://docs.python.org/3/tutorial/index.html  None  None     None
-        2                                            NaN   NaN   NaN      NaN  \
-                  4
-        0  sentence
-        1      None
-        2       NaN
+                                                       0     1     2        3         4
+        0                                           this    is     a  regular  sentence
+        1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
+        2                                            NaN   NaN   NaN      NaN       NaN
 
         For slightly more complex use cases like splitting the html document name
         from a url, a combination of parameter settings can be used.
 
@@ -2658,7 +2669,9 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
         expressions.
 
         >>> s = pd.Series(["1+1=2"])
-
+        >>> s
+        0    1+1=2
+        dtype: object
         >>> s.str.split(r"\+|=", expand=True)
              0    1    2
         0    1    1    2
@@ -2750,7 +2763,7 @@ def rsplit(self, pat=None, n=-1, expand=False):
 
         >>> idx.str.partition()
         MultiIndex([('X', ' ', '123'),
                     ('Y', ' ', '999')],
-                   dtype='object')
+                   )
 
         Or an index with tuples with ``expand=False``:

From 9585a4140370e75afa12616ec0d9b02276a6c4d0 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 6 Apr 2020 22:52:52 +0200
Subject: [PATCH 05/29] DOC: do not include type hints in signature in html
 docs (#33312)

---
 doc/source/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 35833627f6c05..d24483abd28e1 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -109,6 +109,7 @@
     )
 )
 autosummary_generate = True if pattern is None else ["index"]
+autodoc_typehints = "none"
 
 # numpydoc
 numpydoc_attributes_as_param_list = False

From ed862c01fee1dd2d87de2dcf7f69eb7a4f2177aa Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Mon, 6 Apr 2020 23:15:18 +0200
Subject: [PATCH 06/29] BUG: DataFrame fail to construct when data is list and
 columns is nested list for MI (#32202)

---
 doc/source/whatsnew/v1.1.0.rst          |   1 +
 pandas/core/internals/construction.py   | 114 ++++++++++++++++++++----
 pandas/tests/frame/test_constructors.py |  26 ++++++
 3 files changed, 123 insertions(+), 18 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 7cb7db27ae603..2df732d67b5da 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -471,6 +471,7 @@ Other
   instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`)
 - Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`)
 - Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`)
+- Bug in :class:`DataFrame` when initializing a frame with lists and assigning ``columns`` with nested list for ``MultiIndex`` (:issue:`32173`)
 - Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`)
 - Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`).
 - :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:32538`)
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index fc7da4155db36..5c9e4b96047ee 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -3,12 +3,13 @@
 constructors before passing them to a BlockManager.
 """
 from collections import abc
-from typing import Tuple
+from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import numpy.ma as ma
 
 from pandas._libs import lib
+from pandas._typing import Axis, Dtype, Scalar
 
 from pandas.core.dtypes.cast import (
     construct_1d_arraylike_from_scalar,
@@ -522,7 +523,12 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
     return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
 
 
-def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
+def _list_to_arrays(
+    data: List[Scalar],
+    columns: Union[Index, List],
+    coerce_float: bool = False,
+    dtype: Optional[Dtype] = None,
+) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
     if len(data) > 0 and isinstance(data[0], tuple):
         content = list(lib.to_object_array_tuples(data).T)
     else:
         content = list(lib.to_object_array(data).T)
     # gh-26429 do not raise user-facing AssertionError
     try:
-        result = _convert_object_array(
-            content, columns, dtype=dtype, coerce_float=coerce_float
-        )
+        columns = _validate_or_indexify_columns(content, columns)
+        result = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
     except AssertionError as e:
         raise ValueError(e) from e
-    return result
+    return result, columns
 
 
-def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
+def _list_of_series_to_arrays(
+    data: List,
+    columns: Union[Index, List],
+    coerce_float: bool = False,
+    dtype: Optional[Dtype] = None,
+) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
     if columns is None:
         # We know pass_data is non-empty because data[0] is a Series
         pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
         columns = get_objs_combined_axis(pass_data, sort=False)
 
-    indexer_cache = {}
+    indexer_cache: Dict[int, Scalar] = {}
 
     aligned_values = []
     for s in data:
@@ -564,14 +574,19 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
 
     if values.dtype == np.object_:
         content = list(values.T)
-        return _convert_object_array(
-            content, columns, dtype=dtype, coerce_float=coerce_float
-        )
+        columns = _validate_or_indexify_columns(content, columns)
+        content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
+        return content, columns
     else:
         return values.T, columns
 
 
-def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
+def _list_of_dict_to_arrays(
+    data: List,
+    columns: Union[Index, List],
+    coerce_float: bool = False,
+    dtype: Optional[Dtype] = None,
+) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
     """
     Convert list of dicts to numpy arrays
 
@@ -603,22 +618,85 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
     data = [(type(d) is dict) and d or dict(d) for d in data]
 
     content = list(lib.dicts_to_array(data, list(columns)).T)
-    return _convert_object_array(
-        content, columns, dtype=dtype, coerce_float=coerce_float
-    )
+    columns = _validate_or_indexify_columns(content, columns)
+    content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
+    return content, columns
 
 
-def _convert_object_array(content, columns, coerce_float=False, dtype=None):
+def _validate_or_indexify_columns(
+    content: List, columns: Union[Index, List, None]
+) -> Union[Index, List[Axis]]:
+    """
+    If columns is None, make numbers as column names; otherwise, validate that
+    columns have valid length.
+
+    Parameters
+    ----------
+    content: list of data
+    columns: Iterable or None
+
+    Returns
+    -------
+    columns: If columns is Iterable, return as is; if columns is None, assign
+    positional column index value as columns.
+
+    Raises
+    ------
+    1. AssertionError when content is not composed of list of lists, and if
+        length of columns is not equal to length of content.
+    2. ValueError when content is list of lists, but length of each sub-list
+        is not equal
+    3. ValueError when content is list of lists, but length of sub-list is
+        not equal to length of content
+    """
     if columns is None:
         columns = ibase.default_index(len(content))
     else:
+
+        # Add mask for data which is composed of list of lists
+        is_mi_list = isinstance(columns, list) and all(
+            isinstance(col, list) for col in columns
+        )
+
-        if len(columns) != len(content):  # pragma: no cover
+        if not is_mi_list and len(columns) != len(content):  # pragma: no cover
             # caller's responsibility to check for this...
             raise AssertionError(
                 f"{len(columns)} columns passed, passed data had "
                 f"{len(content)} columns"
             )
+        elif is_mi_list:
+
+            # check if nested list column, length of each sub-list should be equal
+            if len({len(col) for col in columns}) > 1:
+                raise ValueError(
+                    "Length of columns passed for MultiIndex columns is different"
+                )
+
+            # if columns is not empty and length of sublist is not equal to content
+            elif columns and len(columns[0]) != len(content):
+                raise ValueError(
+                    f"{len(columns[0])} columns passed, passed data had "
+                    f"{len(content)} columns"
+                )
+    return columns
+
+
+def _convert_object_array(
+    content: List[Scalar], coerce_float: bool = False, dtype: Optional[Dtype] = None
+) -> List[Scalar]:
+    """
+    Internal function to convert object array.
+
+    Parameters
+    ----------
+    content: list of processed data records
+    coerce_float: bool, to coerce floats or not, default is False
+    dtype: np.dtype, default is None
+
+    Returns
+    -------
+    arrays: casted content if not object dtype, otherwise return as is in list.
+    """
     # provide soft conversion of object dtypes
     def convert(arr):
         if dtype != object and dtype != np.object:
@@ -628,7 +706,7 @@ def convert(arr):
 
     arrays = [convert(arr) for arr in content]
 
-    return arrays, columns
+    return arrays
 
 
 # ---------------------------------------------------------------------
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index fc27f19490a9b..baac87755c6d2 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -1063,6 +1063,32 @@ def test_constructor_list_of_lists(self):
         result = DataFrame(data)
         tm.assert_frame_equal(result, expected)
 
+    def test_constructor_list_like_data_nested_list_column(self):
+        # GH 32173
+        arrays = [list("abcd"), list("cdef")]
+        result = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)
+
+        mi = MultiIndex.from_arrays(arrays)
+        expected = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=mi)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_wrong_length_nested_list_column(self):
+        # GH 32173
+        arrays = [list("abc"), list("cde")]
+
+        msg = "3 columns passed, passed data had 4"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)
+
+    def test_constructor_unequal_length_nested_list_column(self):
+        # GH 32173
+        arrays = [list("abcd"), list("cde")]
+
+        msg = "Length of columns passed for MultiIndex columns is different"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)
+
     def test_constructor_sequence_like(self):
         # GH 3783
         # collections.Squence like
expected["b"].cat.set_categories([1, 2, 3]) + + final_data = np.array(final_data) + + a = pd.Categorical(final_data[:, 0], categories=[3, 2]) + + excat = [3, 2] if replace_dict["b"] == 1 else [1, 3] + b = pd.Categorical(final_data[:, 1], categories=excat) + + expected = DataFrame({"a": a, "b": b}) result = df.replace(replace_dict, 3) tm.assert_frame_equal(result, expected) with pytest.raises(AssertionError): From 2b322d2030e15ae7af298653fd5a8f53c612464e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 14:22:21 -0700 Subject: [PATCH 08/29] REF: BlockManager.delete -> idelete (#33332) --- pandas/core/generic.py | 3 ++- pandas/core/internals/managers.py | 15 ++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fac4ca6768ece..3363d22686f96 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3713,7 +3713,8 @@ def __delitem__(self, key) -> None: # If the above loop ran and didn't delete anything because # there was no match, this call should raise the appropriate # exception: - self._mgr.delete(key) + loc = self.axes[-1].get_loc(key) + self._mgr.idelete(loc) # delete from the caches try: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c6ce4aea9fa40..c98c21dfcc80e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1007,12 +1007,10 @@ def iget(self, i: int) -> "SingleBlockManager": self.axes[1], ) - def delete(self, item): + def idelete(self, indexer): """ - Delete selected item (items if non-unique) in-place. + Delete selected locations in-place (new block and array, same BlockManager) """ - indexer = self.items.get_loc(item) - is_deleted = np.zeros(self.shape[0], dtype=np.bool_) is_deleted[indexer] = True ref_loc_offset = -is_deleted.cumsum() @@ -1606,15 +1604,14 @@ def _consolidate_check(self): def _consolidate_inplace(self): pass - def delete(self, item): + def idelete(self, indexer): """ - Delete single item from SingleBlockManager. + Delete single location from SingleBlockManager. Ensures that self.blocks doesn't become empty. 
""" - loc = self.items.get_loc(item) - self._block.delete(loc) - self.axes[0] = self.axes[0].delete(loc) + self._block.delete(indexer) + self.axes[0] = self.axes[0].delete(indexer) def fast_xs(self, loc): """ From d4d75387b88a78658e53807c2b5860cfcd555687 Mon Sep 17 00:00:00 2001 From: rebecca-palmer Date: Mon, 6 Apr 2020 22:34:55 +0100 Subject: [PATCH 09/29] TST: Don't use 'is' on strings to avoid SyntaxWarning (#33322) --- pandas/tests/frame/test_alter_axes.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 961c18749f055..b28e8a5b347aa 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -234,9 +234,16 @@ def test_set_index_pass_arrays_duplicate( # need to adapt first drop for case that both keys are 'A' -- # cannot drop the same column twice; - # use "is" because == would give ambiguous Boolean error for containers + # plain == would give ambiguous Boolean error for containers first_drop = ( - False if (keys[0] is "A" and keys[1] is "A") else drop # noqa: F632 + False + if ( + isinstance(keys[0], str) + and keys[0] == "A" + and isinstance(keys[1], str) + and keys[1] == "A" + ) + else drop ) # to test against already-tested behaviour, we add sequentially, # hence second append always True; must wrap keys in list, otherwise From e3eb29cf44fcd2e2da249d2872b45c4b2c0c4bd8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 14:39:20 -0700 Subject: [PATCH 10/29] CLN: remove fill_tuple kludge (#33310) --- pandas/core/internals/blocks.py | 13 ++++++------- pandas/core/internals/managers.py | 23 +++++++++-------------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index fe58fd3af966c..c23f78d845cfd 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1241,7 +1241,7 @@ def func(x): blocks = [self.make_block_same_class(interp_values)] return self._maybe_downcast(blocks, downcast) - def take_nd(self, indexer, axis: int, new_mgr_locs=None, fill_tuple=None): + def take_nd(self, indexer, axis: int, new_mgr_locs=None, fill_value=lib.no_default): """ Take values according to indexer and return them as a block.bb @@ -1252,11 +1252,10 @@ def take_nd(self, indexer, axis: int, new_mgr_locs=None, fill_tuple=None): values = self.values - if fill_tuple is None: + if fill_value is lib.no_default: fill_value = self.fill_value allow_fill = False else: - fill_value = fill_tuple[0] allow_fill = True new_values = algos.take_nd( @@ -1721,14 +1720,14 @@ def to_native_types(self, na_rep="nan", quoting=None, **kwargs): # we are expected to return a 2-d ndarray return values.reshape(1, len(values)) - def take_nd(self, indexer, axis: int = 0, new_mgr_locs=None, fill_tuple=None): + def take_nd( + self, indexer, axis: int = 0, new_mgr_locs=None, fill_value=lib.no_default + ): """ Take values according to indexer and return them as a block. 
""" - if fill_tuple is None: + if fill_value is lib.no_default: fill_value = None - else: - fill_value = fill_tuple[0] # axis doesn't matter; we are really a single-dim object # but are passed the axis depending on the calling routing diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c98c21dfcc80e..9191c2f0a0a76 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1297,14 +1297,14 @@ def reindex_indexer( raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0(indexer, fill_tuple=(fill_value,)) + new_blocks = self._slice_take_blocks_ax0(indexer, fill_value=fill_value) else: new_blocks = [ blk.take_nd( indexer, axis=axis, - fill_tuple=( - fill_value if fill_value is not None else blk.fill_value, + fill_value=( + fill_value if fill_value is not None else blk.fill_value ), ) for blk in self.blocks @@ -1315,7 +1315,7 @@ def reindex_indexer( return type(self).from_blocks(new_blocks, new_axes) - def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): + def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): """ Slice/take blocks along axis=0. @@ -1325,7 +1325,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): ------- new_blocks : list of Block """ - allow_fill = fill_tuple is not None + allow_fill = fill_value is not lib.no_default sl_type, slobj, sllen = _preprocess_slice_or_indexer( slice_or_indexer, self.shape[0], allow_fill=allow_fill @@ -1337,16 +1337,15 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): if sl_type in ("slice", "mask"): return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: - if allow_fill and fill_tuple[0] is None: + if allow_fill and fill_value is None: _, fill_value = maybe_promote(blk.dtype) - fill_tuple = (fill_value,) return [ blk.take_nd( slobj, axis=0, new_mgr_locs=slice(0, sllen), - fill_tuple=fill_tuple, + fill_value=fill_value, ) ] @@ -1369,8 +1368,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): blocks = [] for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=True): if blkno == -1: - # If we've got here, fill_tuple was not None. 
- fill_value = fill_tuple[0] + # If we've got here, fill_value was not lib.no_default blocks.append( self._make_na_block(placement=mgr_locs, fill_value=fill_value) @@ -1391,10 +1389,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): else: blocks.append( blk.take_nd( - blklocs[mgr_locs.indexer], - axis=0, - new_mgr_locs=mgr_locs, - fill_tuple=None, + blklocs[mgr_locs.indexer], axis=0, new_mgr_locs=mgr_locs, ) ) From fcfa7c47a8471c3287d903f5fe79bd38beda37c7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 14:53:19 -0700 Subject: [PATCH 11/29] TST: misplaced reduction/indexing tests (#33307) --- pandas/tests/frame/test_analytics.py | 25 +++++++++++++++++++ pandas/tests/frame/test_timeseries.py | 23 ----------------- .../tests/indexes/datetimes/test_indexing.py | 7 ++++++ pandas/tests/series/indexing/test_datetime.py | 15 ----------- pandas/tests/series/test_reductions.py | 11 ++++++++ 5 files changed, 43 insertions(+), 38 deletions(-) create mode 100644 pandas/tests/series/test_reductions.py diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 6525e93d89fce..e1fc7e9d7c5b8 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1274,3 +1274,28 @@ def test_series_broadcasting(self): df_nan.clip(lower=s, axis=0) for op in ["lt", "le", "gt", "ge", "eq", "ne"]: getattr(df, op)(s_nan, axis=0) + + +class TestDataFrameReductions: + def test_min_max_dt64_with_NaT(self): + # Both NaT and Timestamp are in DataFrame. + df = pd.DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]}) + + res = df.min() + exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) + tm.assert_series_equal(res, exp) + + res = df.max() + exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) + tm.assert_series_equal(res, exp) + + # GH12941, only NaTs are in DataFrame. + df = pd.DataFrame({"foo": [pd.NaT, pd.NaT]}) + + res = df.min() + exp = pd.Series([pd.NaT], index=["foo"]) + tm.assert_series_equal(res, exp) + + res = df.max() + exp = pd.Series([pd.NaT], index=["foo"]) + tm.assert_series_equal(res, exp) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 452af895e4967..dea921a92ae37 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -54,29 +54,6 @@ def test_frame_append_datetime64_col_other_units(self): assert (tmp["dates"].values == ex_vals).all() - def test_operation_on_NaT(self): - # Both NaT and Timestamp are in DataFrame. - df = pd.DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]}) - - res = df.min() - exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) - tm.assert_series_equal(res, exp) - - res = df.max() - exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) - tm.assert_series_equal(res, exp) - - # GH12941, only NaTs are in DataFrame. 
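[Illustration] A user-level call that exercises the cleaned-up plumbing (hypothetical session):

    import pandas as pd

    ser = pd.Series([1.0, 2.0], index=["a", "b"])
    # reindex now threads a scalar fill_value (lib.no_default as the
    # "no fill" sentinel) through _slice_take_blocks_ax0/take_nd instead
    # of the old 1-tuple fill_tuple.
    res = ser.reindex(["a", "c"], fill_value=0.0)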
From fcfa7c47a8471c3287d903f5fe79bd38beda37c7 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 6 Apr 2020 14:53:19 -0700
Subject: [PATCH 11/29] TST: misplaced reduction/indexing tests (#33307)

---
 pandas/tests/frame/test_analytics.py          | 25 +++++++++++++++++++
 pandas/tests/frame/test_timeseries.py         | 23 -----------------
 .../tests/indexes/datetimes/test_indexing.py  |  7 ++++++
 pandas/tests/series/indexing/test_datetime.py | 15 -----------
 pandas/tests/series/test_reductions.py        | 11 ++++++++
 5 files changed, 43 insertions(+), 38 deletions(-)
 create mode 100644 pandas/tests/series/test_reductions.py

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 6525e93d89fce..e1fc7e9d7c5b8 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -1274,3 +1274,28 @@ def test_series_broadcasting(self):
             df_nan.clip(lower=s, axis=0)
         for op in ["lt", "le", "gt", "ge", "eq", "ne"]:
             getattr(df, op)(s_nan, axis=0)
+
+
+class TestDataFrameReductions:
+    def test_min_max_dt64_with_NaT(self):
+        # Both NaT and Timestamp are in DataFrame.
+        df = pd.DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]})
+
+        res = df.min()
+        exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"])
+        tm.assert_series_equal(res, exp)
+
+        res = df.max()
+        exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"])
+        tm.assert_series_equal(res, exp)
+
+        # GH12941, only NaTs are in DataFrame.
+        df = pd.DataFrame({"foo": [pd.NaT, pd.NaT]})
+
+        res = df.min()
+        exp = pd.Series([pd.NaT], index=["foo"])
+        tm.assert_series_equal(res, exp)
+
+        res = df.max()
+        exp = pd.Series([pd.NaT], index=["foo"])
+        tm.assert_series_equal(res, exp)
diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py
index 452af895e4967..dea921a92ae37 100644
--- a/pandas/tests/frame/test_timeseries.py
+++ b/pandas/tests/frame/test_timeseries.py
@@ -54,29 +54,6 @@ def test_frame_append_datetime64_col_other_units(self):
 
         assert (tmp["dates"].values == ex_vals).all()
 
-    def test_operation_on_NaT(self):
-        # Both NaT and Timestamp are in DataFrame.
-        df = pd.DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]})
-
-        res = df.min()
-        exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"])
-        tm.assert_series_equal(res, exp)
-
-        res = df.max()
-        exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"])
-        tm.assert_series_equal(res, exp)
-
-        # GH12941, only NaTs are in DataFrame.
-        df = pd.DataFrame({"foo": [pd.NaT, pd.NaT]})
-
-        res = df.min()
-        exp = pd.Series([pd.NaT], index=["foo"])
-        tm.assert_series_equal(res, exp)
-
-        res = df.max()
-        exp = pd.Series([pd.NaT], index=["foo"])
-        tm.assert_series_equal(res, exp)
-
     def test_datetime_assignment_with_NaT_and_diff_time_units(self):
         # GH 7492
         data_ns = np.array([1, "nat"], dtype="datetime64[ns]")
diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py
index 5882f5c77428b..58e2afc869e02 100644
--- a/pandas/tests/indexes/datetimes/test_indexing.py
+++ b/pandas/tests/indexes/datetimes/test_indexing.py
@@ -476,6 +476,13 @@ def test_get_loc_reasonable_key_error(self):
             index.get_loc("1/1/2000")
 
 
+class TestContains:
+    def test_index_dupes_contains(self):
+        d = datetime(2011, 12, 5, 20, 30)
+        ix = DatetimeIndex([d, d])
+        assert d in ix
+
+
 class TestDatetimeIndex:
     @pytest.mark.parametrize(
         "null", [None, np.nan, np.datetime64("NaT"), pd.NaT, pd.NA]
diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py
index b5d04fd499c08..18c11f2b9eb61 100644
--- a/pandas/tests/series/indexing/test_datetime.py
+++ b/pandas/tests/series/indexing/test_datetime.py
@@ -464,12 +464,6 @@ def test_index_unique(dups):
     assert idx.nunique(dropna=False) == 21
 
 
-def test_index_dupes_contains():
-    d = datetime(2011, 12, 5, 20, 30)
-    ix = DatetimeIndex([d, d])
-    assert d in ix
-
-
 def test_duplicate_dates_indexing(dups):
     ts = dups
 
@@ -705,15 +699,6 @@ def test_set_none_nan():
     assert series[6] is NaT
 
 
-def test_nat_operations():
-    # GH 8617
-    s = Series([0, pd.NaT], dtype="m8[ns]")
-    exp = s[0]
-    assert s.median() == exp
-    assert s.min() == exp
-    assert s.max() == exp
-
-
 def test_setitem_tuple_with_datetimetz():
     # GH 20441
     arr = date_range("2017", periods=4, tz="US/Eastern")
diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py
new file mode 100644
index 0000000000000..be9330a14f9c9
--- /dev/null
+++ b/pandas/tests/series/test_reductions.py
@@ -0,0 +1,11 @@
+import pandas as pd
+from pandas import Series
+
+
+def test_reductions_td64_with_nat():
+    # GH#8617
+    ser = Series([0, pd.NaT], dtype="m8[ns]")
+    exp = ser[0]
+    assert ser.median() == exp
+    assert ser.min() == exp
+    assert ser.max() == exp
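[Illustration] The relocated reduction test's behavior, restated from the new test above:

    import pandas as pd
    from pandas import Series

    ser = Series([0, pd.NaT], dtype="m8[ns]")
    # NaT is skipped by default, so each reduction returns ser[0]
    assert ser.min() == ser[0]
    assert ser.max() == ser[0]
    assert ser.median() == ser[0]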
From 7a468b01726dadb7f75c5b427822fe6da7e9c753 Mon Sep 17 00:00:00 2001
From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com>
Date: Mon, 6 Apr 2020 16:58:58 -0500
Subject: [PATCH 12/29] BUG: Don't raise on value_counts for empty Int64
 (#33339)

---
 doc/source/whatsnew/v1.1.0.rst               |  2 +-
 pandas/core/arrays/integer.py                |  3 ++-
 pandas/tests/arrays/integer/test_function.py | 10 ++++++++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 2df732d67b5da..92cfa6812ddd7 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -461,7 +461,7 @@ Sparse
 ExtensionArray
 ^^^^^^^^^^^^^^
 
--
+- Fixed bug where :meth:`Series.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`)
 -
 
 
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 4f3c68aa03b16..f5189068d5da1 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -499,7 +499,8 @@ def _values_for_argsort(self) -> np.ndarray:
         ExtensionArray.argsort
         """
         data = self._data.copy()
-        data[self._mask] = data.min() - 1
+        if self._mask.any():
+            data[self._mask] = data.min() - 1
         return data
 
     @classmethod
diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py
index 58913189593a9..bdf902d1aca62 100644
--- a/pandas/tests/arrays/integer/test_function.py
+++ b/pandas/tests/arrays/integer/test_function.py
@@ -103,6 +103,16 @@ def test_value_counts_na():
     tm.assert_series_equal(result, expected)
 
 
+def test_value_counts_empty():
+    # https://github.com/pandas-dev/pandas/issues/33317
+    s = pd.Series([], dtype="Int64")
+    result = s.value_counts()
+    # TODO: The dtype of the index seems wrong (it's int64 for non-empty)
+    idx = pd.Index([], dtype="object")
+    expected = pd.Series([], index=idx, dtype="Int64")
+    tm.assert_series_equal(result, expected)
+
+
 # TODO(jreback) - these need testing / are broken
 
 # shift

From 0a2b9cdb54fde4f9e45ff10dd05bec7c238c19dc Mon Sep 17 00:00:00 2001
From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com>
Date: Mon, 6 Apr 2020 17:01:56 -0500
Subject: [PATCH 13/29] REGR: Fix bug when replacing categorical value with
 self (#33292)

---
 doc/source/whatsnew/v1.1.0.rst                | 1 +
 pandas/core/arrays/categorical.py             | 2 ++
 pandas/tests/arrays/categorical/test_algos.py | 2 ++
 3 files changed, 5 insertions(+)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 92cfa6812ddd7..170d0f7110aa4 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -294,6 +294,7 @@ Categorical
 - Bug when passing categorical data to :class:`Index` constructor along with ``dtype=object`` incorrectly returning a :class:`CategoricalIndex` instead of object-dtype :class:`Index` (:issue:`32167`)
 - Bug where :class:`Categorical` comparison operator ``__ne__`` would incorrectly evaluate to ``False`` when either element was missing (:issue:`32276`)
 - :meth:`Categorical.fillna` now accepts :class:`Categorical` ``other`` argument (:issue:`32420`)
+- Bug where :meth:`Categorical.replace` would replace with ``NaN`` whenever the new value and replacement value were equal (:issue:`33288`)
 
 Datetimelike
 ^^^^^^^^^^^^
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index ad82d68baa5b3..c9b8db28e0cf6 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2447,6 +2447,8 @@ def replace(self, to_replace, value, inplace: bool = False):
         # other cases, like if both to_replace and value are list-like or if
         # to_replace is a dict, are handled separately in NDFrame
         for replace_value, new_value in replace_dict.items():
+            if new_value == replace_value:
+                continue
             if replace_value in cat.categories:
                 if isna(new_value):
                     cat.remove_categories(replace_value, inplace=True)
diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py
index 10c454f7c479a..325fa476d70e6 100644
--- a/pandas/tests/arrays/categorical/test_algos.py
+++ b/pandas/tests/arrays/categorical/test_algos.py
@@ -64,6 +64,8 @@ def test_isin_cats():
     [
         ("b", "c", ["a", "c"], "Categorical.categories are different"),
         ("c", "d", ["a", "b"], None),
+        # https://github.com/pandas-dev/pandas/issues/33288
+        ("a", "a", ["a", "b"], None),
         ("b", None, ["a", None], "Categorical.categories length are different"),
     ],
 )
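[Illustration] The regression fixed above, sketched (assumes the patch is applied):

    import pandas as pd

    ser = pd.Series(["a", "b"], dtype="category")
    # Replacing a value with itself previously went through the
    # remove-/add-categories path and produced NaN; it is now a no-op
    # (GH#33288).
    res = ser.replace("a", "a")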
From 5a38119dcc363202845785fcf4b6dc7b54e1dd69 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Mon, 6 Apr 2020 17:04:40 -0500
Subject: [PATCH 14/29] Pass method in __finalize__ (#33273)

---
 pandas/core/base.py    |  2 +-
 pandas/core/frame.py   |  8 ++---
 pandas/core/generic.py | 77 +++++++++++++++++++++++++-----------------
 pandas/core/series.py  | 62 ++++++++++++++++++++++------------
 4 files changed, 92 insertions(+), 57 deletions(-)

diff --git a/pandas/core/base.py b/pandas/core/base.py
index a28a2c9594341..5945d8a4b432d 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -1521,4 +1521,4 @@ def duplicated(self, keep="first"):
         else:
             return self._constructor(
                 duplicated(self, keep=keep), index=self.index
-            ).__finalize__(self)
+            ).__finalize__(self, method="duplicated")
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 67523facb7b7d..aedbba755227d 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2515,7 +2515,7 @@ def transpose(self, *args, copy: bool = False) -> "DataFrame":
             new_values, index=self.columns, columns=self.index
         )
 
-        return result.__finalize__(self)
+        return result.__finalize__(self, method="transpose")
 
     @property
     def T(self) -> "DataFrame":
@@ -4470,7 +4470,7 @@ def _maybe_casted_values(index, labels=None):
     @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
     def isna(self) -> "DataFrame":
         result = self._constructor(self._data.isna(func=isna))
-        return result.__finalize__(self)
+        return result.__finalize__(self, method="isna")
 
     @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
     def isnull(self) -> "DataFrame":
@@ -4798,7 +4798,7 @@ def sort_values(
         if inplace:
             return self._update_inplace(result)
         else:
-            return result.__finalize__(self)
+            return result.__finalize__(self, method="sort_values")
 
     def sort_index(
         self,
@@ -4934,7 +4934,7 @@ def sort_index(
         if inplace:
             return self._update_inplace(result)
         else:
-            return result.__finalize__(self)
+            return result.__finalize__(self, method="sort_index")
 
     def value_counts(
         self,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 3363d22686f96..052a4adddca27 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -590,7 +590,9 @@ def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries:
         if copy:
             new_values = new_values.copy()
 
-        return self._constructor(new_values, *new_axes).__finalize__(self)
+        return self._constructor(new_values, *new_axes).__finalize__(
+            self, method="swapaxes"
+        )
 
     def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries:
         """
@@ -993,7 +995,7 @@ def rename(
             self._update_inplace(result)
             return None
         else:
-            return result.__finalize__(self)
+            return result.__finalize__(self, method="rename")
 
     @rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)])
     def rename_axis(self, mapper=lib.no_default, **kwargs):
@@ -1357,7 +1359,7 @@ def __invert__(self):
             return self
 
         new_data = self._mgr.apply(operator.invert)
-        result = self._constructor(new_data).__finalize__(self)
+        result = self._constructor(new_data).__finalize__(self, method="__invert__")
         return result
 
     def __nonzero__(self):
@@ -1802,7 +1804,9 @@ def __array_wrap__(self, result, context=None):
             # ptp also requires the item_from_zerodim
             return result
         d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
-        return self._constructor(result, **d).__finalize__(self)
+        return self._constructor(result, **d).__finalize__(
+            self, method="__array_wrap__"
+        )
 
     # ideally we would define this to avoid the getattr checks, but
     # is slower
@@ -3361,7 +3365,7 @@ class max_speed
         new_data = self._mgr.take(
             indices, axis=self._get_block_manager_axis(axis), verify=True
         )
-        return self._constructor(new_data).__finalize__(self)
+        return self._constructor(new_data).__finalize__(self, method="take")
 
     def _take_with_is_copy(self: FrameOrSeries, indices, axis=0) -> FrameOrSeries:
         """
@@ -4431,7 +4435,7 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries:
         # perform the reindex on the axes
         return self._reindex_axes(
             axes, level, limit, tolerance, method, fill_value, copy
-        ).__finalize__(self)
+        ).__finalize__(self, method="reindex")
 
     def _reindex_axes(
         self: FrameOrSeries, axes, level, limit, tolerance, method, fill_value, copy
@@ -5130,7 +5134,7 @@ def pipe(self, func, *args, **kwargs):
     # Attribute access
 
     def __finalize__(
-        self: FrameOrSeries, other, method=None, **kwargs
+        self: FrameOrSeries, other, method: Optional[str] = None, **kwargs
     ) -> FrameOrSeries:
         """
         Propagate metadata from other to self.
@@ -5139,9 +5143,14 @@ def __finalize__(
         ---------
         other : the object from which to get the attributes that we are going
             to propagate
-        method : optional, a passed method name ; possibly to take different
-            types of propagation actions based on this
+        method : str, optional
+            A passed method name providing context on where ``__finalize__``
+            was called.
+
+            .. warning::
+
+               The value passed as `method` is not currently considered
+               stable across pandas releases.
         """
         if isinstance(other, NDFrame):
             for name in other.attrs:
@@ -5294,10 +5303,10 @@ def _check_inplace_setting(self, value) -> bool_t:
         return True
 
     def _get_numeric_data(self):
-        return self._constructor(self._mgr.get_numeric_data()).__finalize__(self)
+        return self._constructor(self._mgr.get_numeric_data()).__finalize__(self,)
 
     def _get_bool_data(self):
-        return self._constructor(self._mgr.get_bool_data()).__finalize__(self)
+        return self._constructor(self._mgr.get_bool_data()).__finalize__(self,)
 
     # ----------------------------------------------------------------------
     # Internal Interface Methods
@@ -5563,8 +5572,8 @@ def astype(
 
         else:
             # else, only a single dtype is given
-            new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
-            return self._constructor(new_data).__finalize__(self)
+            new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
+            return self._constructor(new_data).__finalize__(self, method="astype")
 
         # GH 19920: retain column metadata after concat
         result = pd.concat(results, axis=1, copy=False)
@@ -5678,7 +5687,7 @@ def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
         """
         data = self._mgr.copy(deep=deep)
         self._clear_item_cache()
-        return self._constructor(data).__finalize__(self)
+        return self._constructor(data).__finalize__(self, method="copy")
 
     def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
         return self.copy(deep=deep)
@@ -5784,7 +5793,7 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries:
             self._mgr.convert(
                 datetime=True, numeric=False, timedelta=True, coerce=False, copy=True
             )
-        ).__finalize__(self)
+        ).__finalize__(self, method="infer_objects")
 
     def convert_dtypes(
         self: FrameOrSeries,
@@ -6111,7 +6120,7 @@ def fillna(
         if inplace:
             return self._update_inplace(result)
         else:
-            return result.__finalize__(self)
+            return result.__finalize__(self, method="fillna")
 
     def ffill(
         self: FrameOrSeries,
@@ -6627,7 +6636,7 @@ def replace(
         if inplace:
             return self._update_inplace(result)
         else:
-            return result.__finalize__(self)
+            return result.__finalize__(self, method="replace")
 
     _shared_docs[
         "interpolate"
@@ -6893,7 +6902,7 @@ def interpolate(
         if inplace:
             return self._update_inplace(result)
         else:
-            return result.__finalize__(self)
+            return result.__finalize__(self, method="interpolate")
 
     # ----------------------------------------------------------------------
     # Timeseries methods Methods
@@ -7131,11 +7140,11 @@ def asof(self, where, subset=None):
 
     @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
     def isna(self: FrameOrSeries) -> FrameOrSeries:
-        return isna(self).__finalize__(self)
+        return isna(self).__finalize__(self, method="isna")
 
     @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
     def isnull(self: FrameOrSeries) -> FrameOrSeries:
-        return isna(self).__finalize__(self)
+        return isna(self).__finalize__(self, method="isnull")
 
     _shared_docs[
         "notna"
@@ -7201,11 +7210,11 @@ def isnull(self: FrameOrSeries) -> FrameOrSeries:
 
     @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
     def notna(self: FrameOrSeries) -> FrameOrSeries:
-        return notna(self).__finalize__(self)
+        return notna(self).__finalize__(self, method="notna")
 
     @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
     def notnull(self: FrameOrSeries) -> FrameOrSeries:
-        return notna(self).__finalize__(self)
+        return notna(self).__finalize__(self, method="notnull")
 
     def _clip_with_scalar(self, lower, upper, inplace: bool_t = False):
         if (lower is not None and np.any(isna(lower))) or (
@@ -8229,7 +8238,7 @@ def ranker(data):
                 pct=pct,
             )
             ranks = self._constructor(ranks, **data._construct_axes_dict())
-            return ranks.__finalize__(self)
+            return ranks.__finalize__(self, method="rank")
 
         # if numeric_only is None, and we can't get anything, we try with
         # numeric_only=True
@@ -8436,7 +8445,10 @@ def _align_frame(
             left.index = join_index
             right.index = join_index
 
-        return left.__finalize__(self), right.__finalize__(other)
+        return (
+            left.__finalize__(self),
+            right.__finalize__(other),
+        )
 
     def _align_series(
         self,
@@ -8520,7 +8532,10 @@ def _align_series(
             left.index = join_index
             right.index = join_index
 
-        return left.__finalize__(self), right.__finalize__(other)
+        return (
+            left.__finalize__(self),
+            right.__finalize__(other),
+        )
 
     def _where(
         self,
@@ -8933,7 +8948,7 @@ def shift(
         else:
             return self.tshift(periods, freq)
 
-        return self._constructor(new_data).__finalize__(self)
+        return self._constructor(new_data).__finalize__(self, method="shift")
 
     def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries:
         """
@@ -8970,7 +8985,7 @@ def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries:
         shifted_axis = self._get_axis(axis)[islicer]
         new_obj.set_axis(shifted_axis, axis=axis, inplace=True)
 
-        return new_obj.__finalize__(self)
+        return new_obj.__finalize__(self, method="slice_shift")
 
     def tshift(
         self: FrameOrSeries, periods: int = 1, freq=None, axis: Axis = 0
@@ -9030,7 +9045,7 @@ def tshift(
 
         result = self.copy()
         result.set_axis(new_ax, axis, inplace=True)
-        return result.__finalize__(self)
+        return result.__finalize__(self, method="tshift")
 
     def truncate(
         self: FrameOrSeries, before=None, after=None, axis=None, copy: bool_t = True
@@ -9241,7 +9256,7 @@ def _tz_convert(ax, tz):
         result = self.copy(deep=copy)
         result = result.set_axis(ax, axis=axis, inplace=False)
-        return result.__finalize__(self)
+        return result.__finalize__(self, method="tz_convert")
 
     def tz_localize(
         self: FrameOrSeries,
@@ -9410,7 +9425,7 @@ def _tz_localize(ax, tz, ambiguous, nonexistent):
         result = self.copy(deep=copy)
         result = result.set_axis(ax, axis=axis, inplace=False)
-        return result.__finalize__(self)
+        return result.__finalize__(self, method="tz_localize")
 
     # ----------------------------------------------------------------------
     # Numeric Methods
@@ -11189,7 +11204,7 @@ def block_accum_func(blk_values):
 
             d = self._construct_axes_dict()
             d["copy"] = False
-            return self._constructor(result, **d).__finalize__(self)
+            return self._constructor(result, **d).__finalize__(self, method=name)
 
         return set_function_name(cum_func, name, cls)
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 5ed8241101925..ccb1ec25b5ba4 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -661,7 +661,7 @@ def view(self, dtype=None) -> "Series":
         """
         return self._constructor(
             self._values.view(dtype), index=self.index
-        ).__finalize__(self)
+        ).__finalize__(self, method="view")
 
     # ----------------------------------------------------------------------
     # NDArray Compat
@@ -829,7 +829,7 @@ def take(self, indices, axis=0, is_copy=None, **kwargs) -> "Series":
 
         return self._constructor(
             new_values, index=new_index, fastpath=True
-        ).__finalize__(self)
+        ).__finalize__(self, method="take")
 
     def _take_with_is_copy(self, indices, axis=0):
         """
@@ -962,12 +962,12 @@ def _get_values_tuple(self, key):
         # If key is contained, would have returned by now
         indexer, new_index = self.index.get_loc_level(key)
         return self._constructor(self._values[indexer], index=new_index).__finalize__(
-            self
+            self,
         )
 
     def _get_values(self, indexer):
         try:
-            return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self)
+            return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self,)
         except ValueError:
             # mpl compat if we look up e.g. ser[:, np.newaxis];
             #  see tests.series.timeseries.test_mpl_compat_hack
@@ -1181,7 +1181,9 @@ def repeat(self, repeats, axis=None) -> "Series":
         nv.validate_repeat(tuple(), dict(axis=axis))
         new_index = self.index.repeat(repeats)
         new_values = self._values.repeat(repeats)
-        return self._constructor(new_values, index=new_index).__finalize__(self)
+        return self._constructor(new_values, index=new_index).__finalize__(
+            self, method="repeat"
+        )
 
     def reset_index(self, level=None, drop=False, name=None, inplace=False):
         """
@@ -1308,7 +1310,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False):
             else:
                 return self._constructor(
                     self._values.copy(), index=new_index
-                ).__finalize__(self)
+                ).__finalize__(self, method="reset_index")
         elif inplace:
             raise TypeError(
                 "Cannot reset_index inplace on a Series to create a DataFrame"
@@ -1707,7 +1709,9 @@ def count(self, level=None):
 
         obs = level_codes[notna(self._values)]
         out = np.bincount(obs, minlength=len(lev) or None)
-        return self._constructor(out, index=lev, dtype="int64").__finalize__(self)
+        return self._constructor(out, index=lev, dtype="int64").__finalize__(
+            self, method="count"
+        )
 
     def mode(self, dropna=True) -> "Series":
         """
@@ -2130,7 +2134,9 @@ def round(self, decimals=0, *args, **kwargs) -> "Series":
         """
         nv.validate_round(args, kwargs)
         result = self._values.round(decimals)
-        result = self._constructor(result, index=self.index).__finalize__(self)
+        result = self._constructor(result, index=self.index).__finalize__(
+            self, method="round"
+        )
 
         return result
 
@@ -2352,7 +2358,9 @@ def diff(self, periods: int = 1) -> "Series":
         dtype: float64
         """
         result = algorithms.diff(self.array, periods)
-        return self._constructor(result, index=self.index).__finalize__(self)
+        return self._constructor(result, index=self.index).__finalize__(
+            self, method="diff"
+        )
 
     def autocorr(self, lag=1) -> float:
         """
@@ -2469,7 +2477,7 @@ def dot(self, other):
         if isinstance(other, ABCDataFrame):
             return self._constructor(
                 np.dot(lvals, rvals), index=other.columns
-            ).__finalize__(self)
+            ).__finalize__(self, method="dot")
         elif isinstance(other, Series):
             return np.dot(lvals, rvals)
         elif isinstance(rvals, np.ndarray):
@@ -2994,7 +3002,7 @@ def _try_kind_sort(arr):
         if inplace:
             self._update_inplace(result)
         else:
-            return result.__finalize__(self)
+            return result.__finalize__(self, method="sort_values")
 
     def sort_index(
         self,
@@ -3172,7 +3180,7 @@ def sort_index(
         if inplace:
             self._update_inplace(result)
         else:
-            return result.__finalize__(self)
+            return result.__finalize__(self, method="sort_index")
 
     def argsort(self, axis=0, kind="quicksort", order=None) -> "Series":
         """
@@ -3206,11 +3214,13 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> "Series":
             result = Series(-1, index=self.index, name=self.name, dtype="int64")
             notmask = ~mask
             result[notmask] = np.argsort(values[notmask], kind=kind)
-            return self._constructor(result, index=self.index).__finalize__(self)
+            return self._constructor(result, index=self.index).__finalize__(
+                self, method="argsort"
+            )
         else:
             return self._constructor(
                 np.argsort(values, kind=kind), index=self.index, dtype="int64"
-            ).__finalize__(self)
+            ).__finalize__(self, method="argsort")
 
     def nlargest(self, n=5, keep="first") -> "Series":
         """
@@ -3428,7 +3438,7 @@ def swaplevel(self, i=-2, j=-1, copy=True) -> "Series":
         assert isinstance(self.index, ABCMultiIndex)
         new_index = self.index.swaplevel(i, j)
         return self._constructor(self._values, index=new_index, copy=copy).__finalize__(
-            self
+            self, method="swaplevel"
         )
 
     def reorder_levels(self, order) -> "Series":
@@ -3632,7 +3642,9 @@ def map(self, arg, na_action=None) -> "Series":
         dtype: object
         """
         new_values = super()._map_values(arg, na_action=na_action)
-        return self._constructor(new_values, index=self.index).__finalize__(self)
+        return self._constructor(new_values, index=self.index).__finalize__(
+            self, method="map"
+        )
 
     def _gotitem(self, key, ndim, subset=None) -> "Series":
         """
@@ -3819,7 +3831,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
         """
         if len(self) == 0:
             return self._constructor(dtype=self.dtype, index=self.index).__finalize__(
-                self
+                self, method="apply"
             )
 
         # dispatch to agg
@@ -3856,7 +3868,9 @@ def f(x):
             # so extension arrays can be used
             return self._constructor_expanddim(pd.array(mapped), index=self.index)
         else:
-            return self._constructor(mapped, index=self.index).__finalize__(self)
+            return self._constructor(mapped, index=self.index).__finalize__(
+                self, method="apply"
+            )
 
     def _reduce(
         self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
@@ -4297,7 +4311,9 @@ def isin(self, values) -> "Series":
         Name: animal, dtype: bool
         """
         result = algorithms.isin(self, values)
-        return self._constructor(result, index=self.index).__finalize__(self)
+        return self._constructor(result, index=self.index).__finalize__(
+            self, method="isin"
+        )
 
     def between(self, left, right, inclusive=True) -> "Series":
         """
@@ -4533,7 +4549,9 @@ def to_timestamp(self, freq=None, how="start", copy=True) -> "Series":
         assert isinstance(self.index, (ABCDatetimeIndex, ABCPeriodIndex))
         new_index = self.index.to_timestamp(freq=freq, how=how)
-        return self._constructor(new_values, index=new_index).__finalize__(self)
+        return self._constructor(new_values, index=new_index).__finalize__(
+            self, method="to_timestamp"
+        )
 
     def to_period(self, freq=None, copy=True) -> "Series":
         """
@@ -4558,7 +4576,9 @@ def to_period(self, freq=None, copy=True) -> "Series":
         assert isinstance(self.index, ABCDatetimeIndex)
         new_index = self.index.to_period(freq=freq)
-        return self._constructor(new_values, index=new_index).__finalize__(self)
+        return self._constructor(new_values, index=new_index).__finalize__(
+            self, method="to_period"
+        )
 
     # ----------------------------------------------------------------------
     # Add index
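[Illustration] What threading `method` enables, sketched (attrs propagation shown; the method string itself is explicitly unstable per the docstring above):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    df.attrs["source"] = "demo"
    # __finalize__ now receives e.g. method="copy", giving subclasses and
    # future metadata hooks context; attrs propagate as before.
    assert df.copy().attrs["source"] == "demo"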
4f1fb462636e815f6798ef43f8eb25ad794c5773 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Tue, 7 Apr 2020 01:10:18 +0300 Subject: [PATCH 15/29] DOC: Added an example for each series.dt field accessor (#33259) --- pandas/core/arrays/datetimes.py | 128 ++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index be2ac8c22bc8a..b9f9edcebad5b 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1239,6 +1239,22 @@ def date(self): "Y", """ The year of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="Y") + ... ) + >>> datetime_series + 0 2000-12-31 + 1 2001-12-31 + 2 2002-12-31 + dtype: datetime64[ns] + >>> datetime_series.dt.year + 0 2000 + 1 2001 + 2 2002 + dtype: int64 """, ) month = _field_accessor( @@ -1246,6 +1262,22 @@ def date(self): "M", """ The month as January=1, December=12. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="M") + ... ) + >>> datetime_series + 0 2000-01-31 + 1 2000-02-29 + 2 2000-03-31 + dtype: datetime64[ns] + >>> datetime_series.dt.month + 0 1 + 1 2 + 2 3 + dtype: int64 """, ) day = _field_accessor( @@ -1253,6 +1285,22 @@ def date(self): "D", """ The day of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="D") + ... ) + >>> datetime_series + 0 2000-01-01 + 1 2000-01-02 + 2 2000-01-03 + dtype: datetime64[ns] + >>> datetime_series.dt.day + 0 1 + 1 2 + 2 3 + dtype: int64 """, ) hour = _field_accessor( @@ -1260,6 +1308,22 @@ def date(self): "h", """ The hours of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="h") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 01:00:00 + 2 2000-01-01 02:00:00 + dtype: datetime64[ns] + >>> datetime_series.dt.hour + 0 0 + 1 1 + 2 2 + dtype: int64 """, ) minute = _field_accessor( @@ -1267,6 +1331,22 @@ def date(self): "m", """ The minutes of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="T") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:01:00 + 2 2000-01-01 00:02:00 + dtype: datetime64[ns] + >>> datetime_series.dt.minute + 0 0 + 1 1 + 2 2 + dtype: int64 """, ) second = _field_accessor( @@ -1274,6 +1354,22 @@ def date(self): "s", """ The seconds of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="s") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:00:01 + 2 2000-01-01 00:00:02 + dtype: datetime64[ns] + >>> datetime_series.dt.second + 0 0 + 1 1 + 2 2 + dtype: int64 """, ) microsecond = _field_accessor( @@ -1281,6 +1377,22 @@ def date(self): "us", """ The microseconds of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="us") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00.000000 + 1 2000-01-01 00:00:00.000001 + 2 2000-01-01 00:00:00.000002 + dtype: datetime64[ns] + >>> datetime_series.dt.microsecond + 0 0 + 1 1 + 2 2 + dtype: int64 """, ) nanosecond = _field_accessor( @@ -1288,6 +1400,22 @@ def date(self): "ns", """ The nanoseconds of the datetime. 
+ + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="ns") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00.000000000 + 1 2000-01-01 00:00:00.000000001 + 2 2000-01-01 00:00:00.000000002 + dtype: datetime64[ns] + >>> datetime_series.dt.nanosecond + 0 0 + 1 1 + 2 2 + dtype: int64 """, ) weekofyear = _field_accessor( From 8150c11db0c21cb8604c7133d2571070cb725787 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 15:16:27 -0700 Subject: [PATCH 16/29] BUG: Timestamp+- ndarray[td64] (#33296) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/tslibs/c_timestamp.pyx | 14 ++++ .../tests/scalar/timestamp/test_arithmetic.py | 73 +++++++++++++------ 3 files changed, 65 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 170d0f7110aa4..cbfc6d63e8ea3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -306,6 +306,7 @@ Datetimelike - :class:`Timestamp` raising confusing error message when year, month or day is missing (:issue:`31200`) - Bug in :class:`DatetimeIndex` constructor incorrectly accepting ``bool``-dtyped inputs (:issue:`32668`) - Bug in :meth:`DatetimeIndex.searchsorted` not accepting a ``list`` or :class:`Series` as its argument (:issue:`32762`) +- Bug in :class:`Timestamp` arithmetic when adding or subtracting a ``np.ndarray`` with ``timedelta64`` dtype (:issue:`33296`) Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 3c30460a74ece..04fadf220388f 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -253,6 +253,13 @@ cdef class _Timestamp(datetime): elif is_array(other): if other.dtype.kind in ['i', 'u']: raise integer_op_not_supported(self) + if other.dtype.kind == "m": + if self.tz is None: + return self.asm8 + other + return np.asarray( + [self + other[n] for n in range(len(other))], + dtype=object, + ) # index/series like elif hasattr(other, '_typ'): @@ -275,6 +282,13 @@ cdef class _Timestamp(datetime): elif is_array(other): if other.dtype.kind in ['i', 'u']: raise integer_op_not_supported(self) + if other.dtype.kind == "m": + if self.tz is None: + return self.asm8 - other + return np.asarray( + [self - other[n] for n in range(len(other))], + dtype=object, + ) typ = getattr(other, '_typ', None) if typ is not None: diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index ee70d1d0432fc..b038ee1aee106 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -6,6 +6,7 @@ from pandas.errors import OutOfBoundsDatetime from pandas import Timedelta, Timestamp +import pandas._testing as tm from pandas.tseries import offsets from pandas.tseries.frequencies import to_offset @@ -177,29 +178,6 @@ def test_timestamp_add_timedelta64_unit(self, other, expected_difference): valdiff = result.value - ts.value assert valdiff == expected_difference - @pytest.mark.parametrize("ts", [Timestamp.now(), Timestamp.now("utc")]) - @pytest.mark.parametrize( - "other", - [ - 1, - np.int64(1), - np.array([1, 2], dtype=np.int32), - np.array([3, 4], dtype=np.uint64), - ], - ) - def test_add_int_no_freq_raises(self, ts, other): - msg = "Addition/subtraction of integers and integer-arrays" - with pytest.raises(TypeError, match=msg): - ts + other - with pytest.raises(TypeError, match=msg): - other + ts - - with 
pytest.raises(TypeError, match=msg): - ts - other - msg = "unsupported operand type" - with pytest.raises(TypeError, match=msg): - other - ts - @pytest.mark.parametrize( "ts", [ @@ -229,3 +207,52 @@ def test_add_int_with_freq(self, ts, other): msg = "unsupported operand type" with pytest.raises(TypeError, match=msg): other - ts + + @pytest.mark.parametrize("shape", [(6,), (2, 3,)]) + def test_addsub_m8ndarray(self, shape): + # GH#33296 + ts = Timestamp("2020-04-04 15:45") + other = np.arange(6).astype("m8[h]").reshape(shape) + + result = ts + other + + ex_stamps = [ts + Timedelta(hours=n) for n in range(6)] + expected = np.array([x.asm8 for x in ex_stamps], dtype="M8[ns]").reshape(shape) + tm.assert_numpy_array_equal(result, expected) + + result = other + ts + tm.assert_numpy_array_equal(result, expected) + + result = ts - other + ex_stamps = [ts - Timedelta(hours=n) for n in range(6)] + expected = np.array([x.asm8 for x in ex_stamps], dtype="M8[ns]").reshape(shape) + tm.assert_numpy_array_equal(result, expected) + + msg = r"unsupported operand type\(s\) for -: 'numpy.ndarray' and 'Timestamp'" + with pytest.raises(TypeError, match=msg): + other - ts + + @pytest.mark.parametrize("shape", [(6,), (2, 3,)]) + def test_addsub_m8ndarray_tzaware(self, shape): + # GH#33296 + ts = Timestamp("2020-04-04 15:45", tz="US/Pacific") + + other = np.arange(6).astype("m8[h]").reshape(shape) + + result = ts + other + + ex_stamps = [ts + Timedelta(hours=n) for n in range(6)] + expected = np.array(ex_stamps).reshape(shape) + tm.assert_numpy_array_equal(result, expected) + + result = other + ts + tm.assert_numpy_array_equal(result, expected) + + result = ts - other + ex_stamps = [ts - Timedelta(hours=n) for n in range(6)] + expected = np.array(ex_stamps).reshape(shape) + tm.assert_numpy_array_equal(result, expected) + + msg = r"unsupported operand type\(s\) for -: 'numpy.ndarray' and 'Timestamp'" + with pytest.raises(TypeError, match=msg): + other - ts From 9585ae424c2eb0b05d94232ca5b5df09111c14cd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 15:17:18 -0700 Subject: [PATCH 17/29] BUG: 2D indexing on DTA/TDA/PA (#33290) --- pandas/core/arrays/datetimelike.py | 12 ++-------- pandas/core/indexes/extension.py | 5 +++- pandas/tests/arrays/test_datetimelike.py | 29 ++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c0bbbebac7c33..4fabd8f558fee 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -550,10 +550,7 @@ def __getitem__(self, key): key = np.asarray(key, dtype=bool) key = check_array_indexer(self, key) - if key.all(): - key = slice(0, None, None) - else: - key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + key = lib.maybe_booleans_to_slice(key.view(np.uint8)) elif isinstance(key, list) and len(key) == 1 and isinstance(key[0], slice): # see https://github.com/pandas-dev/pandas/issues/31299, need to allow # this for now (would otherwise raise in check_array_indexer) @@ -561,7 +558,7 @@ def __getitem__(self, key): else: key = check_array_indexer(self, key) - is_period = is_period_dtype(self) + is_period = is_period_dtype(self.dtype) if is_period: freq = self.freq else: @@ -577,11 +574,6 @@ def __getitem__(self, key): freq = self.freq result = getitem(key) - if result.ndim > 1: - # To support MPL which performs slicing with 2 dim - # even though it only has 1 dim by definition - return result - return self._simple_new(result, 
dtype=self.dtype, freq=freq) def __setitem__( diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index f38a4fb83c64f..c752990531b34 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -214,7 +214,10 @@ class ExtensionIndex(Index): def __getitem__(self, key): result = self._data[key] if isinstance(result, type(self._data)): - return type(self)(result, name=self.name) + if result.ndim == 1: + return type(self)(result, name=self.name) + # Unpack to ndarray for MPL compat + result = result._data # Includes cases where we get a 2D ndarray back for MPL compat deprecate_ndim_indexing(result) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 83995ab26cb56..fe35344f46688 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -60,6 +60,12 @@ def timedelta_index(request): class SharedTests: index_cls: Type[Union[DatetimeIndex, PeriodIndex, TimedeltaIndex]] + @pytest.fixture + def arr1d(self): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + return arr + def test_compare_len1_raises(self): # make sure we raise when comparing with different lengths, specific # to the case where one has length-1, which numpy would broadcast @@ -204,6 +210,18 @@ def test_searchsorted(self): result = arr.searchsorted(pd.NaT) assert result == 0 + def test_getitem_2d(self, arr1d): + # 2d slicing on a 1D array + expected = type(arr1d)(arr1d._data[:, np.newaxis], dtype=arr1d.dtype) + result = arr1d[:, np.newaxis] + tm.assert_equal(result, expected) + + # Lookup on a 2D array + arr2d = expected + expected = type(arr2d)(arr2d._data[:3, 0], dtype=arr2d.dtype) + result = arr2d[:3, 0] + tm.assert_equal(result, expected) + def test_setitem(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 arr = self.array_cls(data, freq="D") @@ -265,6 +283,13 @@ class TestDatetimeArray(SharedTests): array_cls = DatetimeArray dtype = pd.Timestamp + @pytest.fixture + def arr1d(self, tz_naive_fixture): + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01 01:01:00", periods=3, freq="H", tz=tz) + dta = dti._data + return dta + def test_round(self, tz_naive_fixture): # GH#24064 tz = tz_naive_fixture @@ -645,6 +670,10 @@ class TestPeriodArray(SharedTests): array_cls = PeriodArray dtype = pd.Period + @pytest.fixture + def arr1d(self, period_index): + return period_index._data + def test_from_pi(self, period_index): pi = period_index arr = PeriodArray(pi) From c05d28b9b25918ab1013db947e9beada9d55fce9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 16:19:10 -0700 Subject: [PATCH 18/29] REF: dispatch TDBlock.to_native_types to TDA._format_native_types (#33270) --- pandas/core/arrays/timedeltas.py | 2 +- pandas/core/construction.py | 2 +- pandas/core/indexes/accessors.py | 16 ++++++++-------- pandas/core/internals/blocks.py | 22 +++------------------- pandas/core/tools/timedeltas.py | 4 ++-- pandas/io/formats/format.py | 5 ----- pandas/tests/frame/test_dtypes.py | 7 +------ pandas/tests/io/formats/test_format.py | 6 +++--- pandas/tests/series/test_dtypes.py | 11 +++++++---- 9 files changed, 26 insertions(+), 49 deletions(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a9c8977991740..8c93dca783113 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -389,7 +389,7 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): 
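# A minimal sketch of the behavior this refactor standardizes (values are
# arbitrary): timedelta strings rendered via _format_native_types now always
# carry the days component, matching the docstring updates further down.
import numpy as np
import pandas as pd

ser = pd.Series(pd.to_timedelta(np.arange(3), unit="s"))
print(ser.astype(str).tolist())
# ['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02']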
from pandas.io.formats.format import _get_format_timedelta64 formatter = _get_format_timedelta64(self._data, na_rep) - return np.array([formatter(x) for x in self._data]) + return np.array([formatter(x) for x in self._data.ravel()]).reshape(self.shape) # ---------------------------------------------------------------- # Arithmetic Methods diff --git a/pandas/core/construction.py b/pandas/core/construction.py index c9754ff588896..2d60ad9ba50bf 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -187,7 +187,7 @@ def array( >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]') - ['01:00:00', '02:00:00'] + ['0 days 01:00:00', '0 days 02:00:00'] Length: 2, dtype: timedelta64[ns] Examples diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 2908d468bcae0..d2cee5d94422c 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -241,9 +241,9 @@ class TimedeltaProperties(Properties): ... pd.timedelta_range(start="1 second", periods=3, freq="S") ... ) >>> seconds_series - 0 00:00:01 - 1 00:00:02 - 2 00:00:03 + 0 0 days 00:00:01 + 1 0 days 00:00:02 + 2 0 days 00:00:03 dtype: timedelta64[ns] >>> seconds_series.dt.seconds 0 1 @@ -301,11 +301,11 @@ def components(self): -------- >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s')) >>> s - 0 00:00:00 - 1 00:00:01 - 2 00:00:02 - 3 00:00:03 - 4 00:00:04 + 0 0 days 00:00:00 + 1 0 days 00:00:01 + 2 0 days 00:00:02 + 3 0 days 00:00:03 + 4 0 days 00:00:04 dtype: timedelta64[ns] >>> s.dt.components days hours minutes seconds milliseconds microseconds nanoseconds diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c23f78d845cfd..ba2fd037901a2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2357,26 +2357,10 @@ def fillna(self, value, **kwargs): ) return super().fillna(value, **kwargs) - def to_native_types(self, na_rep=None, quoting=None, **kwargs): + def to_native_types(self, na_rep="NaT", **kwargs): """ convert to our native types format """ - values = self.values - mask = isna(values) - - rvalues = np.empty(values.shape, dtype=object) - if na_rep is None: - na_rep = "NaT" - rvalues[mask] = na_rep - imask = (~mask).ravel() - - # FIXME: - # should use the formats.format.Timedelta64Formatter here - # to figure what format to pass to the Timedelta - # e.g. 
to not show the decimals say - rvalues.flat[imask] = np.array( - [Timedelta(val)._repr_base(format="all") for val in values.ravel()[imask]], - dtype=object, - ) - return rvalues + tda = self.array_values() + return tda._format_native_types(na_rep, **kwargs) class BoolBlock(NumericBlock): diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 960a82caafeeb..48f30acf269da 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -69,8 +69,8 @@ def to_timedelta(arg, unit="ns", errors="raise"): Converting numbers by specifying the `unit` keyword argument: >>> pd.to_timedelta(np.arange(5), unit='s') - TimedeltaIndex(['00:00:00', '00:00:01', '00:00:02', - '00:00:03', '00:00:04'], + TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02', + '0 days 00:00:03', '0 days 00:00:04'], dtype='timedelta64[ns]', freq=None) >>> pd.to_timedelta(np.arange(5), unit='d') TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index a9e668312d751..59542a8da535e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1672,14 +1672,9 @@ def _get_format_timedelta64( even_days = ( np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 ) - all_sub_day = ( - np.logical_and(consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0 - ) if even_days: format = None - elif all_sub_day: - format = "sub_day" else: format = "long" diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 2cda4ba16f7ce..27ebee4aaaccf 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -248,12 +248,7 @@ def test_astype_str(self): { "a": list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))), "b": list(map(str, map(Timestamp, b._values))), - "c": list( - map( - str, - map(lambda x: Timedelta(x)._repr_base(format="all"), c._values), - ) - ), + "c": list(map(lambda x: Timedelta(x)._repr_base(), c._values)), "d": list(map(str, d._values)), "e": list(map(str, e._values)), } diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 1a5d122d732a9..f3c3344992942 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3003,13 +3003,13 @@ def test_days_neg(self): def test_subdays(self): y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="s") result = fmt.Timedelta64Formatter(y, box=True).get_result() - assert result[0].strip() == "'00:00:00'" - assert result[1].strip() == "'00:00:01'" + assert result[0].strip() == "'0 days 00:00:00'" + assert result[1].strip() == "'0 days 00:00:01'" def test_subdays_neg(self): y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="s") result = fmt.Timedelta64Formatter(-y, box=True).get_result() - assert result[0].strip() == "'00:00:00'" + assert result[0].strip() == "'0 days 00:00:00'" assert result[1].strip() == "'-1 days +23:59:59'" def test_zero(self): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 2f2a663d559d0..05e708e575a64 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -132,7 +132,7 @@ def test_astype_str_map(self, dtype, series): expected = series.map(str) tm.assert_series_equal(result, expected) - def test_astype_str_cast(self): + def test_astype_str_cast_dt64(self): # see gh-9757 ts = Series([Timestamp("2010-01-04 00:00:00")]) s = ts.astype(str) @@ -146,11 
+146,14 @@ def test_astype_str_cast(self): expected = Series([str("2010-01-04 00:00:00-05:00")]) tm.assert_series_equal(s, expected) + def test_astype_str_cast_td64(self): + # see gh-9757 + td = Series([Timedelta(1, unit="d")]) - s = td.astype(str) + ser = td.astype(str) - expected = Series([str("1 days 00:00:00.000000000")]) - tm.assert_series_equal(s, expected) + expected = Series([str("1 days")]) + tm.assert_series_equal(ser, expected) def test_astype_unicode(self): # see gh-7758: A bit of magic is required to set From 047e5d7620644e3493aac02f85fc2e78f2ed586b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Apr 2020 16:20:24 -0700 Subject: [PATCH 19/29] REF: put concatenate_block_managers in internals.concat (#33231) --- pandas/core/internals/__init__.py | 7 ++-- pandas/core/internals/concat.py | 58 ++++++++++++++++++++++++++++--- pandas/core/internals/managers.py | 47 ------------------------- 3 files changed, 55 insertions(+), 57 deletions(-) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index bc45b7c74ecc1..1090f862acb8a 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -14,11 +14,8 @@ _safe_reshape, make_block, ) -from pandas.core.internals.managers import ( - BlockManager, - SingleBlockManager, - concatenate_block_managers, -) +from pandas.core.internals.concat import concatenate_block_managers +from pandas.core.internals.managers import BlockManager, SingleBlockManager __all__ = [ "Block", diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 6839d138fbf73..720e6799a3bf3 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -23,9 +23,57 @@ from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos +from pandas.core.internals.blocks import make_block +from pandas.core.internals.managers import BlockManager -def get_mgr_concatenation_plan(mgr, indexers): +def concatenate_block_managers( + mgrs_indexers, axes, concat_axis: int, copy: bool +) -> BlockManager: + """ + Concatenate block managers into one. + + Parameters + ---------- + mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples + axes : list of Index + concat_axis : int + copy : bool + + Returns + ------- + BlockManager + """ + concat_plans = [ + _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers + ] + concat_plan = _combine_concat_plans(concat_plans, concat_axis) + blocks = [] + + for placement, join_units in concat_plan: + + if len(join_units) == 1 and not join_units[0].indexers: + b = join_units[0].block + values = b.values + if copy: + values = values.copy() + else: + values = values.view() + b = b.make_block_same_class(values, placement=placement) + elif _is_uniform_join_units(join_units): + b = join_units[0].block.concat_same_type([ju.block for ju in join_units]) + b.mgr_locs = placement + else: + b = make_block( + _concatenate_join_units(join_units, concat_axis, copy=copy), + placement=placement, + ) + blocks.append(b) + + return BlockManager(blocks, axes) + + +def _get_mgr_concatenation_plan(mgr, indexers): """ Construct concatenation plan for given block manager and indexers. @@ -232,7 +280,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): return values -def concatenate_join_units(join_units, concat_axis, copy): +def _concatenate_join_units(join_units, concat_axis, copy): """ Concatenate values from several join units along selected axis. 
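# A minimal sketch of the code path these relocated helpers serve (frames
# below are arbitrary examples): pd.concat routes same-dtype blocks through
# the uniform join-unit fast path, while mixed dtypes fall back to
# concat_compat.
import pandas as pd

df1 = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
df2 = pd.DataFrame({"a": [3, 4], "b": ["z", "w"]})
res = pd.concat([df1, df2], ignore_index=True)
print(res.dtypes)  # per-column dtypes survive the block-manager concatenation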
""" @@ -371,11 +419,11 @@ def _get_empty_dtype_and_na(join_units): raise AssertionError(msg) -def is_uniform_join_units(join_units) -> bool: +def _is_uniform_join_units(join_units) -> bool: """ Check if the join units consist of blocks of uniform type that can be concatenated using Block.concat_same_type instead of the generic - concatenate_join_units (which uses `concat_compat`). + _concatenate_join_units (which uses `concat_compat`). """ return ( @@ -429,7 +477,7 @@ def _trim_join_unit(join_unit, length): return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape) -def combine_concat_plans(plans, concat_axis): +def _combine_concat_plans(plans, concat_axis): """ Combine multiple concatenation plans into one. diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 9191c2f0a0a76..b0363dd21f616 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -47,12 +47,6 @@ get_block_type, make_block, ) -from pandas.core.internals.concat import ( # all for concatenate_block_managers - combine_concat_plans, - concatenate_join_units, - get_mgr_concatenation_plan, - is_uniform_join_units, -) from pandas.io.formats.printing import pprint_thing @@ -1974,44 +1968,3 @@ def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): if not allow_fill: indexer = maybe_convert_indices(indexer, length) return "fancy", indexer, len(indexer) - - -def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): - """ - Concatenate block managers into one. - - Parameters - ---------- - mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples - axes : list of Index - concat_axis : int - copy : bool - - """ - concat_plans = [ - get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers - ] - concat_plan = combine_concat_plans(concat_plans, concat_axis) - blocks = [] - - for placement, join_units in concat_plan: - - if len(join_units) == 1 and not join_units[0].indexers: - b = join_units[0].block - values = b.values - if copy: - values = values.copy() - else: - values = values.view() - b = b.make_block_same_class(values, placement=placement) - elif is_uniform_join_units(join_units): - b = join_units[0].block.concat_same_type([ju.block for ju in join_units]) - b.mgr_locs = placement - else: - b = make_block( - concatenate_join_units(join_units, concat_axis, copy=copy), - placement=placement, - ) - blocks.append(b) - - return BlockManager(blocks, axes) From 0e382f2f305e4e8a9fa476d5aff4299a9e3e02f6 Mon Sep 17 00:00:00 2001 From: mproszewska <38814059+mproszewska@users.noreply.github.com> Date: Tue, 7 Apr 2020 01:22:59 +0200 Subject: [PATCH 20/29] TST: Add tests for duplicated and drop_duplicates (#32575) --- .../indexes/categorical/test_category.py | 75 +++++++++++++++++-- pandas/tests/indexes/conftest.py | 9 +++ pandas/tests/indexes/datetimes/test_ops.py | 71 +++++++----------- pandas/tests/indexes/period/test_ops.py | 49 ++++++------ pandas/tests/indexes/timedeltas/test_ops.py | 50 +++++++------ 5 files changed, 159 insertions(+), 95 deletions(-) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 543edc6b66ff2..83fe21fd20bfe 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -292,16 +292,81 @@ def test_is_monotonic(self, data, non_lexsorted_data): assert c.is_monotonic_decreasing is False def test_has_duplicates(self): - idx = CategoricalIndex([0, 0, 0], 
name="foo") assert idx.is_unique is False assert idx.has_duplicates is True - def test_drop_duplicates(self): + idx = CategoricalIndex([0, 1], categories=[2, 3], name="foo") + assert idx.is_unique is False + assert idx.has_duplicates is True - idx = CategoricalIndex([0, 0, 0], name="foo") - expected = CategoricalIndex([0], name="foo") - tm.assert_index_equal(idx.drop_duplicates(), expected) + idx = CategoricalIndex([0, 1, 2, 3], categories=[1, 2, 3], name="foo") + assert idx.is_unique is True + assert idx.has_duplicates is False + + @pytest.mark.parametrize( + "data, categories, expected", + [ + ( + [1, 1, 1], + [1, 2, 3], + { + "first": np.array([False, True, True]), + "last": np.array([True, True, False]), + False: np.array([True, True, True]), + }, + ), + ( + [1, 1, 1], + list("abc"), + { + "first": np.array([False, True, True]), + "last": np.array([True, True, False]), + False: np.array([True, True, True]), + }, + ), + ( + [2, "a", "b"], + list("abc"), + { + "first": np.zeros(shape=(3), dtype=np.bool), + "last": np.zeros(shape=(3), dtype=np.bool), + False: np.zeros(shape=(3), dtype=np.bool), + }, + ), + ( + list("abb"), + list("abc"), + { + "first": np.array([False, False, True]), + "last": np.array([False, True, False]), + False: np.array([False, True, True]), + }, + ), + ], + ) + def test_drop_duplicates(self, data, categories, expected): + + idx = CategoricalIndex(data, categories=categories, name="foo") + for keep, e in expected.items(): + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), e) + e = idx[~e] + result = idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result, e) + + @pytest.mark.parametrize( + "data, categories, expected_data, expected_categories", + [ + ([1, 1, 1], [1, 2, 3], [1], [1]), + ([1, 1, 1], list("abc"), [np.nan], []), + ([1, 2, "a"], [1, 2, 3], [1, 2, np.nan], [1, 2]), + ([2, "a", "b"], list("abc"), [np.nan, "a", "b"], ["a", "b"]), + ], + ) + def test_unique(self, data, categories, expected_data, expected_categories): + + idx = CategoricalIndex(data, categories=categories) + expected = CategoricalIndex(expected_data, categories=expected_categories) tm.assert_index_equal(idx.unique(), expected) def test_repr_roundtrip(self): diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index a9fb228073ab4..fb17e1df6341b 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -16,3 +16,12 @@ def sort(request): in in the Index setops methods. """ return request.param + + +@pytest.fixture(params=["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]) +def freq_sample(request): + """ + Valid values for 'freq' parameter used to create date_range and + timedelta_range.. 
+ """ + return request.param diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index cbf6b7b63bd50..c55b0481c1041 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -264,9 +264,9 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None - def test_drop_duplicates_metadata(self): + def test_drop_duplicates_metadata(self, freq_sample): # GH 10115 - idx = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx") + idx = pd.date_range("2011-01-01", freq=freq_sample, periods=10, name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -277,57 +277,38 @@ def test_drop_duplicates_metadata(self): tm.assert_index_equal(idx, result) assert result.freq is None - def test_drop_duplicates(self): + @pytest.mark.parametrize( + "keep, expected, index", + [ + ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)), + ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)), + ( + False, + np.concatenate(([True] * 5, [False] * 5, [True] * 5)), + np.arange(5, 10), + ), + ], + ) + def test_drop_duplicates(self, freq_sample, keep, expected, index): # to check Index/Series compat - base = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx") - idx = base.append(base[:5]) + idx = pd.date_range("2011-01-01", freq=freq_sample, periods=10, name="idx") + idx = idx.append(idx[:5]) - res = idx.drop_duplicates() - tm.assert_index_equal(res, base) - res = Series(idx).drop_duplicates() - tm.assert_series_equal(res, Series(base)) + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected) + expected = idx[~expected] - res = idx.drop_duplicates(keep="last") - exp = base[5:].append(base[:5]) - tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep="last") - tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + result = idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result, expected) - res = idx.drop_duplicates(keep=False) - tm.assert_index_equal(res, base[5:]) - res = Series(idx).drop_duplicates(keep=False) - tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + result = Series(idx).drop_duplicates(keep=keep) + tm.assert_series_equal(result, Series(expected, index=index)) - @pytest.mark.parametrize( - "freq", - [ - "A", - "2A", - "-2A", - "Q", - "-1Q", - "M", - "-1M", - "D", - "3D", - "-3D", - "W", - "-1W", - "H", - "2H", - "-2H", - "T", - "2T", - "S", - "-3S", - ], - ) - def test_infer_freq(self, freq): + def test_infer_freq(self, freq_sample): # GH 11018 - idx = pd.date_range("2011-01-01 09:00:00", freq=freq, periods=10) + idx = pd.date_range("2011-01-01 09:00:00", freq=freq_sample, periods=10) result = pd.DatetimeIndex(idx.asi8, freq="infer") tm.assert_index_equal(idx, result) - assert result.freq == freq + assert result.freq == freq_sample def test_nat(self, tz_naive_fixture): tz = tz_naive_fixture diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 196946e696c8d..fc44226f9d72f 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -81,9 +81,10 @@ def test_value_counts_unique(self): tm.assert_index_equal(idx.unique(), exp_idx) - def test_drop_duplicates_metadata(self): + @pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", 
"3S"]) + def test_drop_duplicates_metadata(self, freq): # GH 10115 - idx = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") + idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -93,26 +94,32 @@ def test_drop_duplicates_metadata(self): tm.assert_index_equal(idx, result) assert idx.freq == result.freq - def test_drop_duplicates(self): + @pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"]) + @pytest.mark.parametrize( + "keep, expected, index", + [ + ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)), + ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)), + ( + False, + np.concatenate(([True] * 5, [False] * 5, [True] * 5)), + np.arange(5, 10), + ), + ], + ) + def test_drop_duplicates(self, freq, keep, expected, index): # to check Index/Series compat - base = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") - idx = base.append(base[:5]) - - res = idx.drop_duplicates() - tm.assert_index_equal(res, base) - res = Series(idx).drop_duplicates() - tm.assert_series_equal(res, Series(base)) - - res = idx.drop_duplicates(keep="last") - exp = base[5:].append(base[:5]) - tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep="last") - tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) - - res = idx.drop_duplicates(keep=False) - tm.assert_index_equal(res, base[5:]) - res = Series(idx).drop_duplicates(keep=False) - tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx") + idx = idx.append(idx[:5]) + + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected) + expected = idx[~expected] + + result = idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result, expected) + + result = Series(idx).drop_duplicates(keep=keep) + tm.assert_series_equal(result, Series(expected, index=index)) def test_order_compat(self): def _check_freq(index, expected_index): diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 4af5df6e2cc55..aa1bf997fc66b 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -134,9 +134,9 @@ def test_order(self): tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None - def test_drop_duplicates_metadata(self): + def test_drop_duplicates_metadata(self, freq_sample): # GH 10115 - idx = pd.timedelta_range("1 day", "31 day", freq="D", name="idx") + idx = pd.timedelta_range("1 day", periods=10, freq=freq_sample, name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -147,36 +147,38 @@ def test_drop_duplicates_metadata(self): tm.assert_index_equal(idx, result) assert result.freq is None - def test_drop_duplicates(self): + @pytest.mark.parametrize( + "keep, expected, index", + [ + ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)), + ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)), + ( + False, + np.concatenate(([True] * 5, [False] * 5, [True] * 5)), + np.arange(5, 10), + ), + ], + ) + def test_drop_duplicates(self, freq_sample, keep, expected, index): # to check Index/Series compat - base = pd.timedelta_range("1 day", "31 day", freq="D", name="idx") - idx = base.append(base[:5]) + idx = pd.timedelta_range("1 day", 
periods=10, freq=freq_sample, name="idx") + idx = idx.append(idx[:5]) - res = idx.drop_duplicates() - tm.assert_index_equal(res, base) - res = Series(idx).drop_duplicates() - tm.assert_series_equal(res, Series(base)) + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected) + expected = idx[~expected] - res = idx.drop_duplicates(keep="last") - exp = base[5:].append(base[:5]) - tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep="last") - tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + result = idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result, expected) - res = idx.drop_duplicates(keep=False) - tm.assert_index_equal(res, base[5:]) - res = Series(idx).drop_duplicates(keep=False) - tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + result = Series(idx).drop_duplicates(keep=keep) + tm.assert_series_equal(result, Series(expected, index=index)) - @pytest.mark.parametrize( - "freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"] - ) - def test_infer_freq(self, freq): + def test_infer_freq(self, freq_sample): # GH#11018 - idx = pd.timedelta_range("1", freq=freq, periods=10) + idx = pd.timedelta_range("1", freq=freq_sample, periods=10) result = pd.TimedeltaIndex(idx.asi8, freq="infer") tm.assert_index_equal(idx, result) - assert result.freq == freq + assert result.freq == freq_sample def test_repeat(self): index = pd.timedelta_range("1 days", periods=2, freq="D") From 717662bf1e9141d6d3e752ea9a0c8c5ca966b284 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Mon, 6 Apr 2020 16:26:48 -0700 Subject: [PATCH 21/29] Ods loses spaces 32207 (#33233) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/excel/_odfreader.py | 27 +++++++++++++++++++- pandas/tests/io/data/excel/test_spaces.ods | Bin 0 -> 9263 bytes pandas/tests/io/data/excel/test_spaces.xls | Bin 0 -> 5632 bytes pandas/tests/io/data/excel/test_spaces.xlsb | Bin 0 -> 8036 bytes pandas/tests/io/data/excel/test_spaces.xlsm | Bin 0 -> 4848 bytes pandas/tests/io/data/excel/test_spaces.xlsx | Bin 0 -> 8622 bytes pandas/tests/io/excel/test_readers.py | 18 +++++++++++++ 8 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/io/data/excel/test_spaces.ods create mode 100644 pandas/tests/io/data/excel/test_spaces.xls create mode 100644 pandas/tests/io/data/excel/test_spaces.xlsb create mode 100644 pandas/tests/io/data/excel/test_spaces.xlsm create mode 100644 pandas/tests/io/data/excel/test_spaces.xlsx diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index cbfc6d63e8ea3..6bb22f4c16aa1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -414,6 +414,7 @@ I/O - Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) - Bug in :meth:`read_csv` was raising an ``IndexError`` when header=None and 2 extra data columns - Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`) +- Bug in :meth:`read_excel` did not correctly handle multiple embedded spaces in OpenDocument text cells. 
(:issue:`32207`) Plotting ^^^^^^^^ diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 7af776dc1a10f..739c77d1c0b99 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -171,7 +171,7 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) elif cell_type == "string": - return str(cell) + return self._get_cell_string_value(cell) elif cell_type == "currency": cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) @@ -182,3 +182,28 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: return pd.to_datetime(str(cell)).time() else: raise ValueError(f"Unrecognized type {cell_type}") + + def _get_cell_string_value(self, cell) -> str: + """ + Find and decode OpenDocument text:s tags that represent + a run length encoded sequence of space characters. + """ + from odf.element import Text, Element + from odf.text import S, P + from odf.namespaces import TEXTNS + + text_p = P().qname + text_s = S().qname + + p = cell.childNodes[0] + + value = [] + if p.qname == text_p: + for k, fragment in enumerate(p.childNodes): + if isinstance(fragment, Text): + value.append(fragment.data) + elif isinstance(fragment, Element): + if fragment.qname == text_s: + spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) + value.append(" " * spaces) + return "".join(value) diff --git a/pandas/tests/io/data/excel/test_spaces.ods b/pandas/tests/io/data/excel/test_spaces.ods new file mode 100644 index 0000000000000000000000000000000000000000..375e839c8c22105e09c86ec28d65268435119a1c GIT binary patch literal 9263 zcmdsdbzD^2*FGq%C?z0*N=Zse4kDcn-Q6(65HqB7mvl+DGIV##5F_1P(j}lY@`HPS zzIyN1d*Ao{_g(YZbM~1z>)HG4v(8$3J$uVZ-$o%oLc%~os*h2X^Rwg$W;N`mvet)yAWYUa#$Y3BLwhS@Fq8>mV`r>y1Tix< zhRXc~vu4DT8XFS{>8220V=9^1TN!}$L6#6^=#MIs4cIhDPDUIXiv;WX5^PBc5rymH z8WIvR62|T8ng?lQG7{3Q5V_Y%qS$z(gt)lWDI)b6C?*1w z5`Qf%Edo?}Eu#Qblm{wnD2R(GN{hAMZZf)ii>&|pOBmolbDno_AVnP zB|9Z8BR)MRJ_DANk&~1KOOFlBObE|TipfY#&dEqF$WG45$GaS%grp#%P1-; zN`GITRajnFTvAw43@fR~FRv}GYc6@8US5z>T9{W=R#;aKE2}84sH&-`ZK$a%t*@=B zudgqvX)UjBt7&SlYJpd`bTzfZ>)U&py84^Dhg+(P+Uv^On(98bHukhsBiiZ*I-3T& zn%mo2+TmTDJ)gSa9iO^85C}xar{SKy;oiZKPs3Bgh_<2Lo{8R$k%8Xnk=~J!k-@Qv zk;&P~v7zaSk?HB_v6-dmxy6~q<(Z}Rg{h&%x#^{a+0UOpFRpH`Z0@YDu54|rZf z5)#@=NfALMr>UK(Ek%4)Vst%CmvYqU(kgi-WI^m9xri7%AqpJ4&kFJhLmF&m1k7(c zBE`@d6CY%MLP2N97OWVezf0Ibg~9-mkP?&hp4JD}wQrxksS82{b>2VHoVai>;i;!d zy5FD7wdU5ZKOT9)J=R#_DB;<+ER7iNFRsGqKc#9lqm8@Jm;0=yv&T)_pKBZoP)FwN z9?SXm5_mq^dk@U8cR0sH^Og&BGr4AiI%}xOXl|ofS)9ZgV^H0uB3FttbFAAR61q7> zpc|({DutgOozvY$LH=E|ZSZ41-V&wn+vo@jA9 zj$7|o>*^;3F--T@r%%eda-Z0QE9}b8Y99!UbEG-B?S!wLx4xz9ai_arC6 zfJa8i7yRrGS@%ZnYN+9o%O3Qz_}^;_UiY30uAStoS}Wy_;fwlI;IEHo?N4; zc5S4>Fy&TluyX|H+RNuhR5~Ca0j6auWV0OyP4JfrX*kImYvK%!m(D(+Q;!Mzt3Z#M zPJ*0XU95hT$=OZCfgk35U|ZalnsB)vm-oJHiHtJWAO0wr!MJTPlLq4N$9mtb%+6Fn zshV~LmbEz*{I&ZlURIvkXwJwAx8I@`N58LGro%xkyM$WXK?eY#JIsjde>mTHUvQfS zqpOzc<7|`J_kya!oeR5)&2s<~ir@NnQ4PErb!BWgjN$xK_F9xJi%7~Ct08aC^A9x~ zTC3BamJ(4A`R{coH=bop#m5|v^*>Q+9Ae;GQ>9D;fbzhX24~cUd9trx7vkB;$jMyA zo1Ril8WlOrP3}0icLGM6pa$mQmVTJslf~onR3Eb*X?zZ=)_DH1Tv3>u-FdwUR=gV$ zaG~BPju(plzGTQuYgG6!az}j&MvIND@o|D#>{gO)R#G+n5a@dzRdpf~2YS zGl*$*L~NNXRhT}1m zgYF!}Je`#9i&O*z$X$10Ls*#r(($Ocn9`Cb)( zFspN@55spHn~Z8mB1=tx@<&`81{1{EYm30Qf~T<0*&p_$iO7S`TvcA=R!2s8Zk)F* zr1)#`zC$F;Ne8>jo`1zK*0G)HydQ0)8!taH&9!?xU5Lg3)2y|AaShcsD+!M zxv(~U^?Y+F$p>Omx-xk7=84iI*{#5_&~t0WOdE~nB_+t`kZ{g7Y>DoRDeCM7QmmS< 
zgA9i@zBy&tBBo}HLOMIDKW#@L5Y-nvJVb6ivV;yt*9{41L>5kExL#arxqR1lCyqGeUEemo54GFc`UyZ)w{iNUpcIXq z?x1Ou$3hc(fypI?`FyN>JNI;KuP!vh<{#jY;LTfGaa;c|-d8l*u#~EcziOD*xFlq2uC>wLSh&!2Aa$-?;g-E0MuyNyL9v3tB4K| z&GwxRt{bK!lrvUmo2TOg+PK{CUeF!u-PX||5b4;X<;j|)t_q;NJ6IS$tU#ZEsfTUA z3)RU1-Ax>Zh*b!>zh6gC_F)Ow-&>fcCBW-Ip;k`(el;c$9p82JfV{cTpHED%nNro4 z2RUn#-6ewS3abMb1wG?~1=h>=ZYJrxWKdm?+b6=RgB(=rif!@Ks~ak}`@a_Qm>HK0 zmqpl7fg4)^!i{1%1I4%}k1}5k(^j(aqPbBqe#*+_*BM1I!n8HJq;@K%s3~t-edUJOm#X>tGj;ddkB^%zK5?WLZgW??dk!iAdmyL(^G3-5K_T zbI!$Vs=kxHgFa20wZM&HaWAkyCmg@e%k92kJHwbWj4R{7Kqlk;P$ozzf<~F_mV%z! z`-)+lM=vva}yJl~i*R&S9=S)>c%9xdRtEdQ@{{uwKo`ZCKh|D=Y)nVQRg z0{H@lqRXE}g(Ib{qlxEVieAy}x@n+)NiJAqC3`e~h${@Dh$Sgtl2t}0?#XtHdG!+I zvAi6AWv*t~r_IFLFwYBB{UdzPNS|4iszisO}rL_O|wn%={LmO)wdz%~8f4Q?K zztIC(>6;oun1w)4D}5WtzbOB%Q3zBYYX9F1=kFebzM-M9rSbK1vHrKg`CW@Q55h(t zY;5`ed-_jd!b%?uGBJigne2>AdZT;6J}fxCdwhPzeG>xAsHk?dJ+zQhErg^6(su-Rm zlI@%O#y!0|G{VIjVfU2jFUIK41U-$SzeMswMq&wK-4&`}TGj^6b z^%{qLGWqnrFog9_%mv=fUC~)+EZNoanRksfbJ;a?u^;Z(pQnt`OJ>`Z-K+;VU#)#- z>c4upR9xX8Vkyv&8avx8$v}!Lh`wOd;U4f-nq6CLu)YfhS|6d zvZHE1>t#-y#$#0s`hiEiCzJx2Q***0FR29vExmL=QBgtEqF&jI=kt!VQfw=gWLBCL z7 zI&I*L=J@QPG)O0*Zapgmz5^Nf^-5*d^pf^MI}mdWA%p4wRpQcJDw_K0g^B6?tfbBD zQcB&jB!0-9u*P@fy-KFxe%bKd+SE(|O7I31;7^CSF zfz$Vki%`uifrNWCFwQbj*5KOD?rb4zx`^1xZEGtpE?~bf2f)pZ)XP!+P-YuV)1Bs z%uXsfu?yMf>KI6~(~f71@rtqpEemcEK!HBs8uT*7ddu0PGzpxe@q7&PKN|6@hQA2?bfRW4G3u%2bS}-oSl6 z+`@(|qf|>4m8UFzW|*>#U`P3?c#I2hqDI8LaKJ+Sj$!WHbqvop0}* z=}8*wMP(%pQb&z`G6Rv{UTi5g3BQwY&Obk8m(4&!5J~Ickh?kz0@$TEy!xwA4|^99MK0-})wV-HGjyxlb3cfq!4m^baMHg%zZf46<} zI?+zY4370WjmQ{jts<(#*4=DY;2GSt1I?=;XNnh0R-pg3{+OdFdC>r)>oVN(xb4{& zIsx%r9~&L&MdpagD{szHU+n4!1n%d8aV)aQr~~eCnxRHy>qnz0&4sm z5wFah>}Gdyg{V9(aa;5E3HVgFr!)JzwSXqNcvUXUM^-^ybh1Q!M%w_&D1*mCFLb!C zT-}#-xK$BpOwI)IuUE-ClYFy>dRLxPo?`e8yMlR%`oyv!VRniAiZKzcO#8{ERL}iZ zIbDhZ*q&=l%GqL9~>m;o$!@- zF~aXx36$dM4i0P`c^af4-;2_Xd}|lhK$_bXm$x#``8Zj7x+cJ+LCnA=6=d^LId}1k z(mVeuOYhHhG)+WvZ`Ma%%{5Q2#Y*^x<&<#1O429wxUi?<){<3o>5&z;r)FA?(_B_3 zh4${XIeJS`Fmm%_bOtU4+onPz(Q1OwG2$(id*aP9l6OZhKH*a>3#s^vw9^3eXg)~U zKgS=_g~6(Q;S_fP8G3yqK4pHHw$(A)sUxdp?HB!g*kxaczG?g zcK!ulT2GOj>-Y6Me$_&H7Hh&~wGhw(BO;c4sl;3i1D}g3VFe-0zO39!c~}st#l1)S zI&JE7kS0bbs;xpv2)_qR)YnhDe7c9741$VrjAHGtk?O8W z=d7UVpx`%c=|}4rkXD%W%)rP&I-hHTCemvqoXbs7=^9GCr2afzw@pV7M^;=kZ9ck* z=mN42Zw50AByuXNYJ2Ez5!**JH@tWhW00m05~#m6ueO6-SbXjc0Z1H%p)Kkt~$ z(-0khX*7EUKVK+0Zl!d7l00zUuh;8?cP7xN1(6IJDtrq&fl-kZ6ykg*xl^fYD}9Ud zM6V&7BG@@Nq28c~AUG+e;*1Ka>L$fNl))=}x=bWzo0ZH?5|*lUvYGFZTJ6m}nx=wt ziI29Ay~cLQZ*+XFEXVSZ87qk|xLFu9=S{Zy)J>>UsTbF-*i)UC{IcS0_Q&cQhilpXw{ zeWFsV``F~hoB#kF^WbE;$RKM3PAEWiRuS)j8E>N$(EW_=Rg7V=-@?)byY~@9qJZ+! z8Hf6KTJy?_)#;2|+&{YBe&M4YlK5j{tf8=OKkp#+l{Pnop{!=Vd{+8SO*H(rS$*gC zBY9BL<7_Jz@RZOc2(ZKf&I}f>+>?&To_G$B$}(8;=5nR9aBuF93K~W>pJ&Ot^F~c> z#y8+;r!Rx*xn9vC%2Wdh-4q2kPfKxK)@G5hw(q82)BI4O#K(bF36x&EbqP70ybr5|nerfBD8gJ-p!1{2CtaOQ z54^76`O`!k9~~WSZEcrccqX+)!~jM;mVsNV8aJ+i^UkRMQZC7*&q0opoJo?!%1SnA zzq!RbIT$R`eE_AnV_AE|olz(7u^y)xt3@hG5FJ;Ln{FA5K7tdGz;Xt&WA+qPk_O*48ReJ>N+5pXkZB9gWJ zYsc9wx6;Js7ohTlobcu?$ZnOH5VeMH*I)*lNC>(hMH%oS`t)*VLg;2WkP^9^%ELx>r~`>?mif?+iL?N#p- zzNi_ozm0^D)je7!1Rk)Sf&Szj;9G7AZ7XgC3on%Ro?%6qVOzC7$s|HTW2<~Hus?$a zKq^X?IL_e4B2-aLVq@v8C1j%hg+jmPurcQ{#LKW*=RE-aA)hW(X-H}eZ3d})Y9!75 zimm4yV?cTR8B08n!O>=cF9VJn=8b{QJqf118Yq?KU<47M6BsO)zY%Z5V3w%e}>?6ZjqG^ymBZGIgy97MVm(zPaQ+p`_mi&;Zdg01xOs5jAI^Ik$w9V2McgUPvLgt+Hrv&WxHv!n>3QNwc+%qnHL|j~^Yi z&a0zNxLB21OdK!fe>bY%F&}2AK*Q_akc)Lp02{a4iEJpBNoH=6OPc50_nfM(MW;F? 
zViwQQxGrJvF_XHRPZ?E6y?|sJ*an z$E8;Wv6q8dCOEBcJ-%g@Z(fIDmVnhd>F|O|=EG?Ma$0zwVqiLy_ZAMmbOC8t?UkUh zHr@*|=ayZs2h;iYJ`U(+M|plI0PIIi-8P{Y49;i_blMx7VCrN|t{{|smcVvT#i^3^X7;2eG>qHd_^pAvt)-v296 z{X_G|UW1=k{z+8*Ja5PTPx9%nwAEkl|EK1kI}CoFU;YN^Cyn(#MPa}9pZ)nsYyD4A zK)*rxNpt-($_)ebQ__Be@+4*H0P$4bFeiVE+vE*G>`K|IY#Xl@|MFoEu{7 zr@a0R&R=P=e@6OikNkgw^eb)lC(aKe{*@E@s}VO{781WBH!C$hq8_wvb zT;lwNLHfJmuc7oNnfoaz*RJztYWH`&U%l^5p#3R4*JF9TTj!6k`**Ehy~mAb{we&| c1oChFw45~Rb)-i^!oB_&Upt+OKkVoK0Yt*vY5)KL literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/test_spaces.xls b/pandas/tests/io/data/excel/test_spaces.xls new file mode 100644 index 0000000000000000000000000000000000000000..316db172360d0d7b849e640136da4a091fbab64e GIT binary patch literal 5632 zcmeHLZ)jUp6hALb+Sj#f(ypDlx%#Xw>((lE6P(Cwwr<)9V`ZxdGSD=AZL_`pA<1A0 zVm3EHa59q(bJh)l!*<8z`H(dPu~FGO@XAHk*S{T%jAt z0<+#+=|ex97y#4fRbXP(UVZ~DX=SQqSm1z+;+GOj_Q(kGgggb0mriT3WAfFNBAFjA zO6k8;pB4S{7FfzwY|pnp<2U1Y1-S71GuHF`uL9SAYr(6*b>JWvSAf)m8^CM8jo>Em zTJTNao58n$*MZlAZv}4vQ~zyV&T3FKjuf<_-*UA!`)g%_ z_v^#oTNq!h0vC>|`o7>Y!I@D4w+NH<+-aDcl(M zGFl|ZqA@#(T8EWNMB`3;WYEHmVs%)K9qmUA*TdFaH_IA6>ZIahj%`_~_^_SK;`T8m zg1PhdiJ47c6g=W>L-O&8azj~YK`5)mlKWu_9{ze=eh9-|f9uM3`}%r1pHMsvBu5+R z6xioDTBiJ4RH9AO6WX^C2NylUUE!Tgm=SCh?&^sS+E%#dIqh~*(=dkIrgXV;Q(n0h zB{0 z2Yn54K|gdO+N(#?H_+@y`v#7ycOw*4>4(uEj>6#lroj>n-p>@{Ml6R6yoM?uMEQ_O zTj0LaR374+Xa}RRBZj!<(vL`di?J7p|h`I{q5J@*$pgEfE@zH=u9Q#5SX%DTQMBX z%SOxj{ipLU*u#rWlqY?Xjs`u3ug`=Wkv{znJ>r68qz$kZ?ZthRJA$Mrs>xf;X=h85vsH@pcmzEH*(23t_q<|ch zqgekiUcLPG-;4hO45&|tr0^OZ1yY<^J^hTMrLO;W^!>>jG9bI5A*OS%^*<2h&1d`H XSpP=oPI>y5BP-Yb`QP9Fwf;W=%ZE3X literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/test_spaces.xlsb b/pandas/tests/io/data/excel/test_spaces.xlsb new file mode 100644 index 0000000000000000000000000000000000000000..e38b6c2d8f17061f1bf5486d18aaea6013639578 GIT binary patch literal 8036 zcmeHMg{B_*Vm?h+(KX^<}IuBGcPEue(JqU1`6fP|EEsz?Yd-6b8;NXLRm z{Z_sAs_5@8_}?)2Kt(?P-}XQ3fjF%$)n;CTT83Tx*b-LH zE)=@qN-S0)vOF0=o6#8o4eH6g8m2Q}!8K|1s(eUz{1|@)|4fgqrY`fi#QVYVh%3hd zOlI*z*~Zoz4(G{Y<>*JKok<5Xaf)*6 zT8@Rw*K*W3c&UH^?Kq(31YJ!Gp(==ZK$N@%q0KQGc`g0UtZa?fdz^97K-sn3C`S`@ zTTgD$u>~>KFY5O4);Hi@`mt&nl)DQ}uc|^ajYA{ru2w(vp=wkyAI0!&tkb{Z!9&qh zmF}|9A3jU~nplbPVj1)n4xd``$(InVD$Xpr%rp|0eBfSxD9ua7r zNxN@4$AkhA0`@XokqtN}q^>EK%uR!W;6zdWKHS+^^eS2p8=r~>T3g**<)6qrlZ|+Y zYMRQ5i{0Adom!*g=I*q-A|AD>s~@M`MJGBUB8y^J;|}>Sn53kD?OuC*-k~vWCJgV-?>|3?4^uR z2y)QgFq9WG&AN_Dor@c>`Z^^_euj*Ka_9EIvS;Q+NlQ2}h2kRvRMTY<4UmJt)ROY3`CBd z>jemY&*UMZx1OL_z1wWA-OX8XR#b<{<-F-?PjA6Z+=9wtg9Ux3(~Xvn>muB)vi1qL z^s+-meSw+R!(N_>d*!Wvm=jDoqPb%l69czQqye#0Q(9q|3i~?TY46%AU|JGs31UIu zbmo)lcdiNHx4>wk=(alO_Bz_so-7Kmo#QS#EATJ6%bYavrB0q2E^oTl43sO zG|XDQE0(Ma*Cv}=iRNofAzH*st%h%{=ZQ(89N-ySZx5<+P zp}crAgx7iWp3Jun zzVw1Tao`UP{-QVn2iVNVs9ipuY7m2o4b!A*zGGVU&7ugN=uT+ z>Tcn66uE0+uZ>_U`pJ~)DCZMF#m%$?(y{4qJ$La74}|>uqCCYUo=TCB$cFneGs2}b zsFM8q99()++@k4=3snG?4ERmbE^;w@rna(dA8&IwYGf#D3#TX(AT-3G9T2Ctr8?*k z3^m>g8oOsAfvP*_fVWZxBNy6lE2k<=Z;Elfm3bXFtN%KoP0+syA=W73T|VUjG1jLL z^K!APbTgN;`-F&SGF40Hh=~5I1FKQmuq~W4hpTiIl!={Kdke|BWP2pWb2(tF$K{pL z7RvfK7_=3?Iab$>d(Bd1pf(a@LFvdY*n&qJDp@c+0QHHwE4r6mI<1YP(~waQ7qAS4 z3!b^o3%mLB(O~yz4+pvQTrh~|r--ViYDF=KA^rEeXB)`~>I1O=fL72~Y5y&O-EF~6 zV4iPBzOT}LL%-(<&_trUOqSVJ@nnwW+~>{b|FL4AzY% zzC#HyLae-nyPDkionXEYb4KS?m9GdaxW2^s+CV-1l5ARgSw25IxbyhY zBc59iWG{}F428N@LByGm=t4T7!H3U`n0ZBu)D5sARGYk@7|cZjgbpN z{I3V(+|#q`Z22%=hzQ1W2pGH@wk7GNUSi-a{m?(mO-PFtSR$E!ZiXd(9ntuh)FfEb z?@chi@8-<=4<(Xp^<}{>IyJ%Rc59>QP}rh7mN5<`;@rAQ#Hx2SgLvj@utr4!8;?4{ z6(9F}wq?%P6ALamH40i32^2OMv9joauVrwEd5!Y5<>GIh 
z>b{T=w~Nk1i@>vM0VY9H=~~)|@2Mt~FU(>T)uZ^Fd_g=<6tD9+-hZxJGE;l1EBIOm z+(DgbW!`zVJ3V`G)b4sPY$!vflwvxDZz zl8eL1Bhy=zULv>nJ`xjIew=PwnPDJpMx{R8#c4=^GfKf>5`?Rsc{H8vs57PvpPCS* zPE9OS1!= zFQryv;B$#pPe5-b%6oQ+OWN`rM$r7M8O#kqpFrY`!#5VFD`Bjj`6=m1$(Z6&Hz2W8 zCC9_?Du z3iQM$`NAOtvNO`m#yghQ+XRTK4|I*W}}Qw9bMxeEKmnvycz1e(WuPxFH%qq(@AlfB%Qi_yMX#!+{C@aZ_85+`P;0Wk>* zzhuAJn?Ww9-WHFHmhp%MNAA?rOl1gVI$!yh)Feh)#|PCDX*&%=hr*uCZFb&MkBj0w zI&^GI6-B1(T@OE=V@)Qb7%<`UX~CvbJf=sCyl$Rf`0G}<@R%OmIJp(IZqgb?vrv=R zp{gUM%5$%hr+++eDC!{Y?u%o%Z?xa?qrozxdwz@u`OyKU0u_XK>>~bC;rk5l!RXPi zx*O3+of~5lc7>n5Af$+7sC<-_@v0<_Qzqv=T`d!_tHsN()S7E)#q*YeN@}+Xv^`!u z)6ogE;xob7|4TR<##S@Hsk#k8ByUQ+D0jwY9q)?aWv);dre;ijE(vDz&jHH0#q@KW zuyRgZCFRO~u<(kSRD$aW_jK`aF6@3AJ{kcEop40(p1>tKJAl&FV`2V-OyCJl_iO{YZrt*(LL%W;SxQkQjPe?XCk$MjVJ&PU>LnqW>OP6 z)xOaM1Byt{D_$vns+xSof}Zb)HC(jb;K3GV@+wv4TWyCJg7eD7i5Q)8$Hn(+)lYZ* zqH+h_-|=N~rq)Y^s=0;i8*)AHD1}*ZxE={7r~0zEx6smImzC;kW2O+IkYSAsC-b8U znfuUl%)1oug6&3Y9}n`yMKRPwsOxBSTctI5u=lO4zMjHtHLSMg%`=A;5~Oiz1>TIb z*B+|1e@S(yGmQ^q>!MJo$V0kbLwhK-rI11pY}S>a%dv-95E)LF`|O&S%Jd}L>$(mP zLw_j~ruTDiJIP%#?Qb(F0&?nQVFm3EGg=NXF%0#5-|Pe^bW)`bM^UvDIgVz0RK zdS`dyLnbwV()%i=`CLUM-93g*o_i#)HbBkPwRov_M7^FOL7}#VzAyo&_oP0|6-6j& zGmoD}UDOhJ20!51NMtW^)x-THnLUjA4xuT8w?1O7SA=)JicDS4ibS{hpymO(F1dTE z@x=OW>XQ?1HzhaA&iNZdWcy$hBhx8ehZCNyJE#gPK3RhHqmy-V`NrE(ijC=`>{Ta` zCU*U+LReXn;*M2!dHlAOgM*#xXaz!xG_xkQ6NBVr2VAO@deU{)F?*aH$lXn?obU>7 ztI$5cER`WoW-%P(H^PQ zG3QRwx6Ry5Qx14rq4&njV;t6KBr4+J0tt~|#l1s>)8^Q2;%D)U^U4tIY1Svvj_D2t z>TtnZeHDS*U8C!wtZeO9N87E;SOSM!z1R9GE%KT4{fM-$k)kQFtw}qS0Wsp3ndfVs zSzOh#@2*g_EfIbyR=xSWHyRU9E$4b_;O&~L!(cVU9<8YN^DKD2fFLGyZpY+>R$bAH zG`?*H(TP_fJ~Mw0PZFV9W2Dg%It zH7A0pSossR8oj60^A<|4j*Nq}M8_gWPcw#Q{naC8xj=3gwjIW}INSKBS6FoboHd9;A&#%DYSK66~YFpqA&QNQbZ9WE_+e;WDw-g}c@kRyg6bu9V!k^dCC zA5pWnvP$Y^eEuWXgiDS#7~qt5$LEn(-EbZLvVAD17N=m|3{; zX_Qcr455M-_dcy4sD?jqi*U1+R za{qgJ^>?l#wZA#UN5R<&?ENj*2VrIaR18o8Dql=MXyBa~AzJpP_JDvIi83uv8W64< zn3NbB*mw<^dxH;&^@qRPy7jz78A&S>WOxXXT>vW=OLd5gs~eA{3k3Xkm;1lTKhkxN zV1`P~yyT(V7#GqV{?BPrKJ!CVYWRiueFzig{a7J0IIKdW^~yL~KQ zJvADCgfRh?TV}fq$$_cvbxg(8ddJ(TKN9+%_SU{Hm9WGZgdROM>9)2~ZRNF9@+ zB9F)~KzUCQifbS9VU88Lqc$2;-17n})apZF$Yz}Hkfc}dvLjyZ;S4l?tlFucIXHAb zdS`NIp=(FWwEck&o_g$Ga!Hu>l*YFO(!rrNqGt!}_8U(r599z1n~$sV*tc^QD9%$w z#^%I`0)2Z!BevSCQm3X9Ike#BRA0)W#CuZe&14HDlU_}c%j>9 zhqQRplp@pWc!7nP@}dW&7AP=J8{2y^=)09cK!7oNac^ipP0 z%Eq;E^L;AwaxX!y50B-LFz{%g-h)8iBylP75t?LPM!n*8s0G9banO+CQHe%dzY!H4 zSXHCcA?g4%&3}>7Lue`8^0bUw9bWA%thb}s3w|u}+HAHwD~mfmCD+~|GVEq_f8;K< zVjA!hVRvbLvHnV;=9C|Kih5oK!f`WMjSYlZ+R zCVk#(ZLY&BCmFF+=};devBy_&|a`VDo9%zm~? 
[base85-encoded binary payload omitted]

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/excel/test_spaces.xlsm b/pandas/tests/io/data/excel/test_spaces.xlsm
new file mode 100644
index 0000000000000000000000000000000000000000..a41ebe5bb0e655ed2c39c52a43baadeed592ae94
GIT binary patch
literal 4848
[base85-encoded binary payload omitted]

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/excel/test_spaces.xlsx b/pandas/tests/io/data/excel/test_spaces.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..9071543c4739bd5c1d6a886f7f4853de777631d9
GIT binary patch
literal 8622
[base85-encoded binary payload omitted]

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index b1502ed3f3c09..99447c03e89af 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -464,6 +464,24 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
         actual = pd.read_excel(basename + read_ext, dtype=dtype)
         tm.assert_frame_equal(actual, expected)
 
+    def test_reader_spaces(self, read_ext):
+        # see gh-32207
+        basename = "test_spaces"
+
+        actual = pd.read_excel(basename + read_ext)
+        expected = DataFrame(
+            {
+                "testcol": [
+                    "this is great",
+                    "4    spaces",
+                    "1 trailing ",
+                    " 1 leading",
+                    "2  spaces  multiple  times",
+                ]
+            }
+        )
+        tm.assert_frame_equal(actual, expected)
+
     def test_reading_all_sheets(self, read_ext):
         # Test reading all sheetnames by setting sheetname to None,
         # Ensure a dict is returned.
From 9c1984c5ce7648eb5a613637791492030801d43a Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 7 Apr 2020 01:28:00 +0200
Subject: [PATCH 22/29] PERF: masked ops for reductions (min/max) (#33261)

---
 doc/source/whatsnew/v1.1.0.rst               |  2 +-
 pandas/core/array_algos/masked_reductions.py | 41 +++++++++++++
 pandas/core/arrays/boolean.py                |  8 +--
 pandas/core/arrays/integer.py                |  7 ++-
 pandas/tests/arrays/integer/test_dtypes.py   |  2 +-
 pandas/tests/reductions/test_reductions.py   | 62 ++++++++++++++------
 6 files changed, 95 insertions(+), 27 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 6bb22f4c16aa1..f74182f6a59c0 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -276,7 +276,7 @@ Performance improvements
   sparse values from ``scipy.sparse`` matrices using the
   :meth:`DataFrame.sparse.from_spmatrix` constructor
   (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`,
   :issue:`32858`).
-- Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`).
+- Performance improvement in reductions (sum, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`).
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py
index 0fb2605b554c2..b3723340cefd6 100644
--- a/pandas/core/array_algos/masked_reductions.py
+++ b/pandas/core/array_algos/masked_reductions.py
@@ -45,3 +45,44 @@ def sum(
             return np.sum(values[~mask])
     else:
         return np.sum(values, where=~mask)
+
+
+def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
+    """
+    Reduction for 1D masked array.
+
+    Parameters
+    ----------
+    func : np.min or np.max
+    values : np.ndarray
+        Numpy array with the values (can be of any dtype that supports the
+        operation).
+    mask : np.ndarray
+        Boolean numpy array (True values indicate missing values).
+    skipna : bool, default True
+        Whether to skip NA.
+    """
+    if not skipna:
+        if mask.any():
+            return libmissing.NA
+        else:
+            if values.size:
+                return func(values)
+            else:
+                # min/max with empty array raise in numpy, pandas returns NA
+                return libmissing.NA
+    else:
+        subset = values[~mask]
+        if subset.size:
+            return func(values[~mask])
+        else:
+            # min/max with empty array raise in numpy, pandas returns NA
+            return libmissing.NA
+
+
+def min(values: np.ndarray, mask: np.ndarray, skipna: bool = True):
+    return _minmax(np.min, values=values, mask=mask, skipna=skipna)
+
+
+def max(values: np.ndarray, mask: np.ndarray, skipna: bool = True):
+    return _minmax(np.max, values=values, mask=mask, skipna=skipna)
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
index 442d4ca8cef6d..e85534def6b97 100644
--- a/pandas/core/arrays/boolean.py
+++ b/pandas/core/arrays/boolean.py
@@ -696,8 +696,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
         data = self._data
         mask = self._mask
 
-        if name == "sum":
-            return masked_reductions.sum(data, mask, skipna=skipna, **kwargs)
+        if name in {"sum", "min", "max"}:
+            op = getattr(masked_reductions, name)
+            return op(data, mask, skipna=skipna, **kwargs)
 
         # coerce to a nan-aware float if needed
         if self._hasna:
@@ -715,9 +716,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
             if int_result == result:
                 result = int_result
 
-        elif name in ["min", "max"] and notna(result):
-            result = np.bool_(result)
-
         return result
 
     def _maybe_mask_result(self, result, mask, other, op_name: str):
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index f5189068d5da1..d47a396bbb14e 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -562,8 +562,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
         data = self._data
         mask = self._mask
 
-        if name == "sum":
-            return masked_reductions.sum(data, mask, skipna=skipna, **kwargs)
+        if name in {"sum", "min", "max"}:
+            op = getattr(masked_reductions, name)
+            return op(data, mask, skipna=skipna, **kwargs)
 
         # coerce to a nan-aware float if needed
         # (we explicitly use NaN within reductions)
@@ -582,7 +583,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
 
         # if we have a preservable numeric op,
         # provide coercion back to an integer type if possible
-        elif name in ["min", "max", "prod"]:
+        elif name == "prod":
             # GH#31409 more performant than casting-then-checking
             result = com.cast_scalar_indexer(result)
 
diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
index ee1ec86745246..515013e95c717 100644
--- a/pandas/tests/arrays/integer/test_dtypes.py
+++ b/pandas/tests/arrays/integer/test_dtypes.py
@@ -34,7 +34,7 @@ def test_preserve_dtypes(op):
 
     # op
     result = getattr(df.C, op)()
-    if op == "sum":
+    if op in {"sum", "min", "max"}:
         assert isinstance(result, np.int64)
     else:
         assert isinstance(result, int)
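In user-facing terms, the new masked min/max paths give the semantics checked in the test diff below. A small sketch for orientation (illustration only, not part of the patch):

# Sketch: nullable-dtype min/max semantics added by this patch.
import pandas as pd

s = pd.Series([1, None, 3], dtype="Int64")
assert s.min() == 1                     # NA skipped by default
assert s.max(skipna=False) is pd.NA     # with skipna=False any NA propagates
assert pd.Series([], dtype="Int64").min() is pd.NA  # empty returns NA, not an error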
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index 962b105d1e8fc..8fb035e085d40 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -65,27 +65,58 @@ def test_ops(self, opname, obj):
         assert result.value == expected
 
     @pytest.mark.parametrize("opname", ["max", "min"])
-    def test_nanops(self, opname, index_or_series):
+    @pytest.mark.parametrize(
+        "dtype, val",
+        [
+            ("object", 2.0),
+            ("float64", 2.0),
+            ("datetime64[ns]", datetime(2011, 11, 1)),
+            ("Int64", 2),
+            ("boolean", True),
+        ],
+    )
+    def test_nanminmax(self, opname, dtype, val, index_or_series):
         # GH#7261
         klass = index_or_series
-        arg_op = "arg" + opname if klass is Index else "idx" + opname
 
-        obj = klass([np.nan, 2.0])
-        assert getattr(obj, opname)() == 2.0
+        if dtype in ["Int64", "boolean"] and klass == pd.Index:
+            pytest.skip("EAs can't yet be stored in an index")
 
-        obj = klass([np.nan])
-        assert pd.isna(getattr(obj, opname)())
-        assert pd.isna(getattr(obj, opname)(skipna=False))
+        def check_missing(res):
+            if dtype == "datetime64[ns]":
+                return res is pd.NaT
+            elif dtype == "Int64":
+                return res is pd.NA
+            else:
+                return pd.isna(res)
 
-        obj = klass([], dtype=object)
-        assert pd.isna(getattr(obj, opname)())
-        assert pd.isna(getattr(obj, opname)(skipna=False))
+        obj = klass([None], dtype=dtype)
+        assert check_missing(getattr(obj, opname)())
+        assert check_missing(getattr(obj, opname)(skipna=False))
 
-        obj = klass([pd.NaT, datetime(2011, 11, 1)])
-        # check DatetimeIndex monotonic path
-        assert getattr(obj, opname)() == datetime(2011, 11, 1)
-        assert getattr(obj, opname)(skipna=False) is pd.NaT
+        obj = klass([], dtype=dtype)
+        assert check_missing(getattr(obj, opname)())
+        assert check_missing(getattr(obj, opname)(skipna=False))
+
+        if dtype == "object":
+            # generic test with object only works for empty / all NaN
+            return
+
+        obj = klass([None, val], dtype=dtype)
+        assert getattr(obj, opname)() == val
+        assert check_missing(getattr(obj, opname)(skipna=False))
 
+        obj = klass([None, val, None], dtype=dtype)
+        assert getattr(obj, opname)() == val
+        assert check_missing(getattr(obj, opname)(skipna=False))
+
+    @pytest.mark.parametrize("opname", ["max", "min"])
+    def test_nanargminmax(self, opname, index_or_series):
+        # GH#7261
+        klass = index_or_series
+        arg_op = "arg" + opname if klass is Index else "idx" + opname
+
+        obj = klass([pd.NaT, datetime(2011, 11, 1)])
         assert getattr(obj, arg_op)() == 1
         result = getattr(obj, arg_op)(skipna=False)
         if klass is Series:
@@ -95,9 +126,6 @@ def test_nanops(self, opname, index_or_series):
 
         obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT])
         # check DatetimeIndex non-monotonic path
-        assert getattr(obj, opname)(), datetime(2011, 11, 1)
-        assert getattr(obj, opname)(skipna=False) is pd.NaT
-
         assert getattr(obj, arg_op)() == 1
         result = getattr(obj, arg_op)(skipna=False)
         if klass is Series:
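The datetime cases split out into test_nanargminmax above boil down to behavior like the following (a sketch, not part of the patch):

# Sketch: NaT handling covered by the reworked reduction tests.
from datetime import datetime
import pandas as pd

s = pd.Series([pd.NaT, datetime(2011, 11, 1)])
assert s.min() == datetime(2011, 11, 1)  # NaT skipped by default
assert s.min(skipna=False) is pd.NaT     # NaT propagates when not skipping
assert s.idxmax() == 1                   # label of the only non-missing value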
From efce8fcee2de9c0ebd18700033bd61eb61c7302f Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 6 Apr 2020 17:14:46 -0700
Subject: [PATCH 23/29] REF: do concat on values, avoid blocks

---
 pandas/core/internals/concat.py               | 5 ++++-
 pandas/core/internals/managers.py             | 2 ++
 pandas/tests/extension/test_external_block.py | 6 ------
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 720e6799a3bf3..3f06d80714623 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -1,6 +1,7 @@
 # TODO: Needs a better name; too many modules are already called "concat"
 from collections import defaultdict
 import copy
+from typing import List
 
 import numpy as np
 
@@ -419,13 +420,15 @@ def _get_empty_dtype_and_na(join_units):
         raise AssertionError(msg)
 
 
-def _is_uniform_join_units(join_units) -> bool:
+def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:
     """
     Check if the join units consist of blocks of uniform type that can
     be concatenated using Block.concat_same_type instead of the generic
     _concatenate_join_units (which uses `concat_compat`).
 
     """
+    # TODO: require dtype match in addition to same type?  e.g. DatetimeTZBlock
+    #  cannot necessarily join
     return (
         # all blocks need to have the same type
         all(type(ju.block) is type(join_units[0].block) for ju in join_units)
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index b0363dd21f616..2caab9f91cb50 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -20,6 +20,7 @@ from pandas.core.dtypes.common import (
     DT64NS_DTYPE,
     is_datetimelike_v_numeric,
+    is_dtype_equal,
     is_extension_array_dtype,
     is_list_like,
     is_numeric_v_string_like,
@@ -42,6 +43,7 @@
     DatetimeTZBlock,
     ExtensionBlock,
     ObjectValuesExtensionBlock,
+    _block_shape,
     _extend_blocks,
     _safe_reshape,
     get_block_type,
diff --git a/pandas/tests/extension/test_external_block.py b/pandas/tests/extension/test_external_block.py
index 9925fd51561ae..1843126898f3d 100644
--- a/pandas/tests/extension/test_external_block.py
+++ b/pandas/tests/extension/test_external_block.py
@@ -32,12 +32,6 @@ def df():
     return pd.DataFrame(block_manager)
 
 
-def test_concat_dataframe(df):
-    # GH17728
-    res = pd.concat([df, df])
-    assert isinstance(res._mgr.blocks[1], CustomBlock)
-
-
 def test_concat_axis1(df):
     # GH17954
     df2 = pd.DataFrame({"c": [0.1, 0.2, 0.3]})

From 362e86c4a1ae46748e56112fb68c4d15de1570dc Mon Sep 17 00:00:00 2001
From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com>
Date: Mon, 6 Apr 2020 19:18:57 -0500
Subject: [PATCH 24/29] CLN: Clean nanops.get_corr_func (#33244)

---
 pandas/core/nanops.py | 43 +++++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 822ab775e7e46..9494248a423a8 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -1332,30 +1332,33 @@ def nancorr(
 
 
 def get_corr_func(method):
-    if method in ["kendall", "spearman"]:
-        from scipy.stats import kendalltau, spearmanr
-    elif method in ["pearson"]:
-        pass
-    elif callable(method):
-        return method
-    else:
-        raise ValueError(
-            f"Unknown method '{method}', expected one of 'kendall', 'spearman'"
-        )
+    if method == "kendall":
+        from scipy.stats import kendalltau
+
+        def func(a, b):
+            return kendalltau(a, b)[0]
 
-    def _pearson(a, b):
-        return np.corrcoef(a, b)[0, 1]
+        return func
+    elif method == "spearman":
+        from scipy.stats import spearmanr
 
-    def _kendall(a, b):
-        # kendallttau returns a tuple of the tau statistic and pvalue
-        rs = kendalltau(a, b)
-        return rs[0]
+        def func(a, b):
+            return spearmanr(a, b)[0]
 
-    def _spearman(a, b):
-        return spearmanr(a, b)[0]
+        return func
+    elif method == "pearson":
 
-    _cor_methods = {"pearson": _pearson, "kendall": _kendall, "spearman": _spearman}
-    return _cor_methods[method]
+        def func(a, b):
+            return np.corrcoef(a, b)[0, 1]
+
+        return func
+    elif callable(method):
+        return method
+
+    raise ValueError(
+        f"Unknown method '{method}', expected one of "
+        "'kendall', 'spearman', 'pearson', or callable"
+    )
 
 
 @disallow("M8", "m8")
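For context, Series.corr reaches get_corr_func via nanops.nancorr, so the cleaned-up branches map one-to-one onto the public method argument. A sketch (not part of the patch; the non-pearson named methods require scipy):

# Sketch: the public surface behind get_corr_func.
import pandas as pd

s1 = pd.Series([1.0, 2.0, 3.0, 4.0])
s2 = pd.Series([1.0, 2.0, 3.0, 5.0])

s1.corr(s2)                     # "pearson" is the default
s1.corr(s2, method="spearman")  # dispatches to scipy.stats.spearmanr
s1.corr(s2, method=lambda a, b: float((a == b).mean()))  # callables pass through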
From 3ad2110d455f3519b5b1b6259df070809314eca9 Mon Sep 17 00:00:00 2001
From: Bharat Raghunathan
Date: Tue, 7 Apr 2020 05:53:20 +0530
Subject: [PATCH 25/29] [DOC]: Mention default behaviour of index_col in readcsv (#32977)

---
 doc/source/user_guide/io.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index a4cc1f9ee02ca..d721e00a0a0b6 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -109,6 +109,11 @@ index_col : int, str, sequence of int / str, or False, default ``None``
   Note: ``index_col=False`` can be used to force pandas to *not* use the first
   column as the index, e.g. when you have a malformed file with delimiters at
   the end of each line.
+
+  The default value of ``None`` instructs pandas to guess. If the number of
+  fields in the column header row is equal to the number of fields in the body
+  of the data file, then a default index is used. If it is one larger, then
+  the first field is used as an index.
 usecols : list-like or callable, default ``None``
   Return a subset of the columns. If list-like, all elements must either
   be positional (i.e. integer indices into the document columns) or strings
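The guessing rule documented above is easy to see directly (a sketch, not part of the patch):

# Sketch: the index_col=None guessing described in the new io.rst text.
from io import StringIO
import pandas as pd

# Header and body rows have the same number of fields -> default RangeIndex.
pd.read_csv(StringIO("a,b\n1,2\n3,4"))

# Body rows carry one extra field -> the first field becomes the index.
pd.read_csv(StringIO("a,b\n10,1,2\n20,3,4"))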
From 3ee836308aad4c58014c16a40f976852e75b3837 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 10 Apr 2020 20:21:51 -0700
Subject: [PATCH 26/29] Remove Block.concat_same_type

---
 pandas/core/internals/blocks.py   | 55 -------------------------------
 pandas/core/internals/concat.py   | 14 ++++++--
 pandas/core/internals/managers.py |  2 --
 3 files changed, 12 insertions(+), 59 deletions(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index d8875b38ed738..e6d7397f90b65 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -48,7 +48,6 @@
     is_timedelta64_dtype,
     pandas_dtype,
 )
-from pandas.core.dtypes.concat import concat_categorical, concat_datetime
 from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
@@ -110,7 +109,6 @@ class Block(PandasObject):
     _can_consolidate = True
     _verify_integrity = True
     _validate_ndim = True
-    _concatenator = staticmethod(np.concatenate)
 
     def __init__(self, values, placement, ndim=None):
         self.ndim = self._check_ndim(values, ndim)
@@ -309,16 +307,6 @@ def shape(self):
     def dtype(self):
         return self.values.dtype
 
-    def concat_same_type(self, to_concat):
-        """
-        Concatenate list of single blocks of the same type.
-        """
-        values = self._concatenator(
-            [blk.values for blk in to_concat], axis=self.ndim - 1
-        )
-        placement = self.mgr_locs if self.ndim == 2 else slice(len(values))
-        return self.make_block_same_class(values, placement=placement)
-
     def iget(self, i):
         return self.values[i]
 
@@ -1772,14 +1760,6 @@ def _slice(self, slicer):
 
         return self.values[slicer]
 
-    def concat_same_type(self, to_concat):
-        """
-        Concatenate list of single blocks of the same type.
-        """
-        values = self._holder._concat_same_type([blk.values for blk in to_concat])
-        placement = self.mgr_locs if self.ndim == 2 else slice(len(values))
-        return self.make_block_same_class(values, placement=placement)
-
     def fillna(self, value, limit=None, inplace=False, downcast=None):
         values = self.values if inplace else self.values.copy()
         values = values.fillna(value=value, limit=limit)
@@ -2261,20 +2241,6 @@ def diff(self, n: int, axis: int = 0) -> List["Block"]:
             new_values = new_values.astype("timedelta64[ns]")
         return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)]
 
-    def concat_same_type(self, to_concat):
-        # need to handle concat([tz1, tz2]) here, since DatetimeArray
-        # only handles cases where all the tzs are the same.
-        # Instead of placing the condition here, it could also go into the
-        # is_uniform_join_units check, but I'm not sure what is better.
-        if len({x.dtype for x in to_concat}) > 1:
-            values = concat_datetime([x.values for x in to_concat])
-
-            values = values.astype(object, copy=False)
-            placement = self.mgr_locs if self.ndim == 2 else slice(len(values))
-
-            return self.make_block(values, placement=placement)
-        return super().concat_same_type(to_concat)
-
     def fillna(self, value, limit=None, inplace=False, downcast=None):
         # We support filling a DatetimeTZ with a `value` whose timezone
         # is different by coercing to object.
@@ -2645,7 +2611,6 @@ class CategoricalBlock(ExtensionBlock):
     is_categorical = True
     _verify_integrity = True
     _can_hold_na = True
-    _concatenator = staticmethod(concat_categorical)
 
     should_store = Block.should_store
 
@@ -2659,26 +2624,6 @@ def __init__(self, values, placement, ndim=None):
     def _holder(self):
         return Categorical
 
-    def concat_same_type(self, to_concat):
-        """
-        Concatenate list of single blocks of the same type.
-
-        Note that this CategoricalBlock._concat_same_type *may* not
-        return a CategoricalBlock. When the categories in `to_concat`
-        differ, this will return an object ndarray.
-
-        If / when we decide we don't like that behavior:
-
-        1. Change Categorical._concat_same_type to use union_categoricals
-        2. Delete this method.
-        """
-        values = self._concatenator(
-            [blk.values for blk in to_concat], axis=self.ndim - 1
-        )
-        placement = self.mgr_locs if self.ndim == 2 else slice(len(values))
-        # not using self.make_block_same_class as values can be object dtype
-        return self.make_block(values, placement=placement)
-
     def replace(
         self,
         to_replace,
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 3f06d80714623..97edfec3985b0 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -62,8 +62,18 @@ def concatenate_block_managers(
                 values = values.view()
             b = b.make_block_same_class(values, placement=placement)
         elif _is_uniform_join_units(join_units):
-            b = join_units[0].block.concat_same_type([ju.block for ju in join_units])
-            b.mgr_locs = placement
+            blk = join_units[0].block
+            vals = [ju.block.values for ju in join_units]
+            if not blk.is_extension:
+                values = concat_compat(vals, axis=blk.ndim - 1)
+            elif blk.is_datetimetz or blk.is_categorical:
+                # These can have the same type but multiple dtypes,
+                # we concatting does not necessarily preserve dtype
+                values = concat_compat(vals, axis=blk.ndim - 1)
+            elif blk.is_extension:
+                values = blk._holder._concat_same_type(vals)
+
+            b = make_block(values, placement=placement, ndim=blk.ndim)
         else:
             b = make_block(
                 _concatenate_join_units(join_units, concat_axis, copy=copy),
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 436ef140c1a3b..f3b4ebad9cec1 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -19,7 +19,6 @@
 from pandas.core.dtypes.common import (
     DT64NS_DTYPE,
     is_datetimelike_v_numeric,
-    is_dtype_equal,
     is_extension_array_dtype,
     is_list_like,
     is_numeric_v_string_like,
@@ -43,7 +42,6 @@
     DatetimeTZBlock,
     ExtensionBlock,
     ObjectValuesExtensionBlock,
-    _block_shape,
     _extend_blocks,
     _safe_reshape,
     get_block_type,
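Behavior the refactored path has to preserve: same-typed concatenation keeps the dtype, including tz-aware datetimes and categoricals. A sketch (not part of the patch):

# Sketch: dtype preservation through the uniform-join-units path.
import pandas as pd

tz = pd.Series(pd.date_range("2020-01-01", periods=2, tz="UTC"))
assert pd.concat([tz, tz], ignore_index=True).dtype == tz.dtype  # tz kept

cat = pd.Series(pd.Categorical(["a", "b"]))
assert pd.concat([cat, cat], ignore_index=True).dtype == "category"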
From 41d6da0c351795b3983b8bf9f480824612821318 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sat, 11 Apr 2020 13:45:49 -0700
Subject: [PATCH 27/29] use concat_compat

---
 pandas/core/internals/concat.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 97edfec3985b0..a57bd46d0e033 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -64,14 +64,15 @@ def concatenate_block_managers(
         elif _is_uniform_join_units(join_units):
             blk = join_units[0].block
             vals = [ju.block.values for ju in join_units]
+
             if not blk.is_extension:
                 values = concat_compat(vals, axis=blk.ndim - 1)
             elif blk.is_datetimetz or blk.is_categorical:
                 # These can have the same type but multiple dtypes,
                 # we concatting does not necessarily preserve dtype
                 values = concat_compat(vals, axis=blk.ndim - 1)
-            elif blk.is_extension:
-                values = blk._holder._concat_same_type(vals)
+            else:
+                values = concat_compat(vals)
 
             b = make_block(values, placement=placement, ndim=blk.ndim)
         else:

From 2e070ca6aa7a95caa22111375109062ba93398a7 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sun, 12 Apr 2020 14:50:07 -0700
Subject: [PATCH 28/29] combine cases

---
 pandas/core/internals/concat.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index a57bd46d0e033..37e081aeba3f6 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -65,13 +65,12 @@ def concatenate_block_managers(
             blk = join_units[0].block
             vals = [ju.block.values for ju in join_units]
 
-            if not blk.is_extension:
-                values = concat_compat(vals, axis=blk.ndim - 1)
-            elif blk.is_datetimetz or blk.is_categorical:
-                # These can have the same type but multiple dtypes,
-                # we concatting does not necessarily preserve dtype
+            if not blk.is_extension or blk.is_datetimetz or blk.is_categorical:
+                # datetimetz and categorical can have the same type but multiple
+                # dtypes, concatting does not necessarily preserve dtype
                 values = concat_compat(vals, axis=blk.ndim - 1)
             else:
+                # TODO(EA2D): special-casing not needed with 2D EAs
                 values = concat_compat(vals)
 
             b = make_block(values, placement=placement, ndim=blk.ndim)

From 675a94822d811ae04b22d04e955cd6c396f8302c Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 14 Apr 2020 07:05:55 -0700
Subject: [PATCH 29/29] Dummy commit to force CI
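After the consolidation above, concat_compat also owns the fallback the removed CategoricalBlock docstring used to warn about: concatenating categoricals whose categories differ yields object dtype, not category. A closing sketch (not part of the patch):

# Sketch: mismatched categories fall back to object dtype.
import pandas as pd

a = pd.Series(pd.Categorical(["x", "y"]))
b = pd.Series(pd.Categorical(["y", "z"]))
assert pd.concat([a, b], ignore_index=True).dtype == object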