From a21ce87329c320e8483f63f226d90587c79ae9a3 Mon Sep 17 00:00:00 2001
From: Thomas Smith
Date: Fri, 7 Aug 2020 16:21:11 +0100
Subject: [PATCH 01/51] BUG: GroupBy.count() and GroupBy.sum() incorrectly
 return NaN instead of 0 for missing categories (#35280)

---
 doc/source/whatsnew/v1.2.0.rst           |  3 +-
 pandas/core/groupby/generic.py           |  8 ++++-
 pandas/core/groupby/groupby.py           | 16 +++++++--
 pandas/tests/groupby/test_categorical.py | 46 +++++++-----------------
 pandas/tests/reshape/test_pivot.py       | 13 +++----
 5 files changed, 42 insertions(+), 44 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 10dfd8406b8ce..260b92b5989c1 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -152,6 +152,7 @@ Plotting

 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
+- Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`)
 - Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`)
 -
 -
@@ -160,7 +161,7 @@ Groupby/resample/rolling

 Reshaping
 ^^^^^^^^^
--
+- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`)
 -

 Sparse
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index c50b753cf3293..740463f0cf356 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1829,7 +1829,13 @@ def count(self):
         )
         blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)]

-        return self._wrap_agged_blocks(blocks, items=data.items)
+        # If we are grouping on categoricals we want unobserved categories to
+        # return zero, rather than the default of NaN which the reindexing in
+        # _wrap_agged_blocks() returns. GH 35028
+        with com.temp_setattr(self, "observed", True):
+            result = self._wrap_agged_blocks(blocks, items=data.items)
+
+        return self._reindex_output(result, fill_value=0)

     def nunique(self, dropna: bool = True):
         """
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 6c8a780859939..ed512710295d7 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1536,9 +1536,19 @@ def size(self) -> FrameOrSeriesUnion:

     @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
     def sum(self, numeric_only: bool = True, min_count: int = 0):
-        return self._agg_general(
-            numeric_only=numeric_only, min_count=min_count, alias="add", npfunc=np.sum
-        )
+
+        # If we are grouping on categoricals we want unobserved categories to
+        # return zero, rather than the default of NaN which the reindexing in
+        # _agg_general() returns. GH #31422
+        with com.temp_setattr(self, "observed", True):
+            result = self._agg_general(
+                numeric_only=numeric_only,
+                min_count=min_count,
+                alias="add",
+                npfunc=np.sum,
+            )
+
+        return self._reindex_output(result, fill_value=0)

     @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
     def prod(self, numeric_only: bool = True, min_count: int = 0):
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 0d447a70b540d..c74c1529eb537 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -19,7 +19,7 @@
 import pandas._testing as tm


-def cartesian_product_for_groupers(result, args, names):
+def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN):
     """ Reindex to a cartesian production for the groupers,
     preserving the nature (Categorical) of each grouper """

@@ -33,7 +33,7 @@ def f(a):
         return a

     index = MultiIndex.from_product(map(f, args), names=names)
-    return result.reindex(index).sort_index()
+    return result.reindex(index, fill_value=fill_value).sort_index()


 _results_for_groupbys_with_missing_categories = dict(
@@ -309,7 +309,7 @@ def test_observed(observed):
     result = gb.sum()
     if not observed:
         expected = cartesian_product_for_groupers(
-            expected, [cat1, cat2, ["foo", "bar"]], list("ABC")
+            expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
         )

     tm.assert_frame_equal(result, expected)
@@ -319,7 +319,9 @@ def test_observed(observed):
     expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
     result = gb.sum()
     if not observed:
-        expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB"))
+        expected = cartesian_product_for_groupers(
+            expected, [cat1, cat2], list("AB"), fill_value=0
+        )

     tm.assert_frame_equal(result, expected)

@@ -1189,6 +1191,8 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
     ).sortlevel()

     expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
+    if operation == "agg":
+        expected = expected.fillna(0, downcast="infer")
     grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
     result = getattr(grouped, operation)(sum)
     tm.assert_series_equal(result, expected)
@@ -1338,15 +1342,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
         )
         request.node.add_marker(mark)

-    if reduction_func == "sum":  # GH 31422
-        mark = pytest.mark.xfail(
-            reason=(
-                "sum should return 0 but currently returns NaN. "
-                "This is a known bug. See GH 31422."
-            )
-        )
-        request.node.add_marker(mark)
-
     df = pd.DataFrame(
         {
             "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
@@ -1367,8 +1362,11 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
         val = result.loc[idx]
         assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)

-    # If we expect unobserved values to be zero, we also expect the dtype to be int
-    if zero_or_nan == 0:
+    # If we expect unobserved values to be zero, we also expect the dtype to be int.
+    # Except for .sum(). If the observed categories sum to dtype=float (i.e. their
+    # sums have decimals), then the zeros for the missing categories should also be
+    # floats.
+    if zero_or_nan == 0 and reduction_func != "sum":
         assert np.issubdtype(result.dtype, np.integer)


@@ -1410,24 +1408,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
     if reduction_func == "ngroup":
         pytest.skip("ngroup does not return the Categories on the index")

-    if reduction_func == "count":  # GH 35028
-        mark = pytest.mark.xfail(
-            reason=(
-                "DataFrameGroupBy.count returns np.NaN for missing "
-                "categories, when it should return 0. See GH 35028"
-            )
-        )
-        request.node.add_marker(mark)
-
-    if reduction_func == "sum":  # GH 31422
-        mark = pytest.mark.xfail(
-            reason=(
-                "sum should return 0 but currently returns NaN. "
-                "This is a known bug. See GH 31422."
-            )
-        )
-        request.node.add_marker(mark)
-
     df = pd.DataFrame(
         {
             "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index c07a5673fe503..67b3151b0ff9c 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1817,7 +1817,7 @@ def test_categorical_aggfunc(self, observed):
             ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1"
         )
         expected_columns = pd.Index(["a", "b"], name="C2")
-        expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]])
+        expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64)
         expected = pd.DataFrame(
             expected_data, index=expected_index, columns=expected_columns
         )
@@ -1851,18 +1851,19 @@ def test_categorical_pivot_index_ordering(self, observed):
             values="Sales",
             index="Month",
             columns="Year",
-            dropna=observed,
+            observed=observed,
             aggfunc="sum",
         )
         expected_columns = pd.Int64Index([2013, 2014], name="Year")
         expected_index = pd.CategoricalIndex(
-            ["January"], categories=months, ordered=False, name="Month"
+            months, categories=months, ordered=False, name="Month"
         )
+        expected_data = [[320, 120]] + [[0, 0]] * 11
         expected = pd.DataFrame(
-            [[320, 120]], index=expected_index, columns=expected_columns
+            expected_data, index=expected_index, columns=expected_columns
         )
+        if observed:
+            expected = expected.loc[["January"]]

         tm.assert_frame_equal(result, expected)
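A minimal sketch of the behavior this patch fixes, mirroring the new tests above (the data are illustrative and the output assumes the fix is applied):

    import pandas as pd

    df = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")),
            "value": [0.1] * 4,
        }
    )
    # Grouping on two Categoricals with observed=False materializes every
    # category combination; unobserved combinations now sum to 0, not NaN.
    res = df.groupby(["cat_1", "cat_2"], observed=False)["value"].sum()
    print(res.loc[("C", "C")])  # 0.0 after the fix, NaN before
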
From ce03883911ec35c3eea89ed828765ec4be415263 Mon Sep 17 00:00:00 2001
From: SylvainLan
Date: Fri, 7 Aug 2020 17:32:26 +0200
Subject: [PATCH 02/51] To latex position (#35284)

---
 pandas/core/generic.py                   |  5 +++
 pandas/io/formats/format.py              |  2 +
 pandas/io/formats/latex.py               | 42 +++++++++++++--------
 pandas/tests/io/formats/test_to_latex.py | 48 ++++++++++++++++++++++++
 4 files changed, 82 insertions(+), 15 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 441eef26bea58..6fd55c58ece40 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2840,6 +2840,7 @@ def to_latex(
         multirow=None,
         caption=None,
         label=None,
+        position=None,
     ):
         r"""
         Render object to a LaTeX tabular, longtable, or nested table/tabular.
@@ -2925,6 +2926,9 @@ def to_latex(
             This is used with ``\ref{}`` in the main ``.tex`` file.

             .. versionadded:: 1.0.0
+        position : str, optional
+            The LaTeX positional argument for tables, to be placed after
+            ``\begin{}`` in the output.
 %(returns)s
         See Also
         --------
@@ -2986,6 +2990,7 @@ def to_latex(
             multirow=multirow,
             caption=caption,
             label=label,
+            position=position,
         )

     def to_csv(
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index c05f79f935548..81990b3d505e1 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -931,6 +931,7 @@ def to_latex(
         multirow: bool = False,
         caption: Optional[str] = None,
         label: Optional[str] = None,
+        position: Optional[str] = None,
     ) -> Optional[str]:
         """
         Render a DataFrame to a LaTeX tabular/longtable environment output.
@@ -946,6 +947,7 @@ def to_latex(
             multirow=multirow,
             caption=caption,
             label=label,
+            position=position,
         ).get_result(buf=buf, encoding=encoding)

     def _format_col(self, i: int) -> List[str]:
diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py
index 3a3ca84642d51..5d6f0a08ef2b5 100644
--- a/pandas/io/formats/latex.py
+++ b/pandas/io/formats/latex.py
@@ -38,6 +38,7 @@ def __init__(
         multirow: bool = False,
         caption: Optional[str] = None,
         label: Optional[str] = None,
+        position: Optional[str] = None,
     ):
         self.fmt = formatter
         self.frame = self.fmt.frame
@@ -50,6 +51,8 @@ def __init__(
         self.caption = caption
         self.label = label
         self.escape = self.fmt.escape
+        self.position = position
+        self._table_float = any(p is not None for p in (caption, label, position))

     def write_result(self, buf: IO[str]) -> None:
         """
@@ -284,7 +287,7 @@ def _write_tabular_begin(self, buf, column_format: str):
         `__ e.g 'rcl' for 3 columns
         """
-        if self.caption is not None or self.label is not None:
+        if self._table_float:
             # then write output in a nested table/tabular environment
             if self.caption is None:
                 caption_ = ""
@@ -296,7 +299,12 @@ def _write_tabular_begin(self, buf, column_format: str):
             else:
                 label_ = f"\n\\label{{{self.label}}}"

-            buf.write(f"\\begin{{table}}\n\\centering{caption_}{label_}\n")
+            if self.position is None:
+                position_ = ""
+            else:
+                position_ = f"[{self.position}]"
+
+            buf.write(f"\\begin{{table}}{position_}\n\\centering{caption_}{label_}\n")
         else:
             # then write output only in a tabular environment
             pass
@@ -317,7 +325,7 @@ def _write_tabular_end(self, buf):
         """
         buf.write("\\bottomrule\n")
         buf.write("\\end{tabular}\n")
-        if self.caption is not None or self.label is not None:
+        if self._table_float:
             buf.write("\\end{table}\n")
         else:
             pass
@@ -337,25 +345,29 @@ def _write_longtable_begin(self, buf, column_format: str):
         `__ e.g 'rcl' for 3 columns
         """
-        buf.write(f"\\begin{{longtable}}{{{column_format}}}\n")
+        if self.caption is None:
+            caption_ = ""
+        else:
+            caption_ = f"\\caption{{{self.caption}}}"

-        if self.caption is not None or self.label is not None:
-            if self.caption is None:
-                pass
-            else:
-                buf.write(f"\\caption{{{self.caption}}}")
+        if self.label is None:
+            label_ = ""
+        else:
+            label_ = f"\\label{{{self.label}}}"

-            if self.label is None:
-                pass
-            else:
-                buf.write(f"\\label{{{self.label}}}")
+        if self.position is None:
+            position_ = ""
+        else:
+            position_ = f"[{self.position}]"

+        buf.write(
+            f"\\begin{{longtable}}{position_}{{{column_format}}}\n{caption_}{label_}"
+        )
+        if self.caption is not None or self.label is not None:
             # a double-backslash is required at the end of the line
             # as discussed here:
             # https://tex.stackexchange.com/questions/219138
             buf.write("\\\\\n")
-        else:
-            pass

     @staticmethod
     def _write_longtable_end(buf):
diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py
index 509e5bcb33304..93ad3739e59c7 100644
--- a/pandas/tests/io/formats/test_to_latex.py
+++ b/pandas/tests/io/formats/test_to_latex.py
@@ -573,6 +573,54 @@ def test_to_latex_longtable_caption_label(self):
 """
         assert result_cl == expected_cl

+    def test_to_latex_position(self):
+        the_position = "h"
+
+        df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
+
+        # test when only the position is provided
+        result_p = df.to_latex(position=the_position)
+
+        expected_p = r"""\begin{table}[h]
+\centering
+\begin{tabular}{lrl}
+\toprule
+{} & a & b \\
+\midrule
+0 & 1 & b1 \\
+1 & 2 & b2 \\
+\bottomrule
+\end{tabular}
+\end{table}
+"""
+        assert result_p == expected_p
+
+    def test_to_latex_longtable_position(self):
+        the_position = "t"
+
+        df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
+
+        # test when only the position is provided
+        result_p = df.to_latex(longtable=True, position=the_position)
+
+        expected_p = r"""\begin{longtable}[t]{lrl}
+\toprule
+{} & a & b \\
+\midrule
+\endhead
+\midrule
+\multicolumn{3}{r}{{Continued on next page}} \\
+\midrule
+\endfoot
+
+\bottomrule
+\endlastfoot
+0 & 1 & b1 \\
+1 & 2 & b2 \\
+\end{longtable}
+"""
+        assert result_p == expected_p
+
     def test_to_latex_escape_special_chars(self):
         special_characters = ["&", "%", "$", "#", "_", "{", "}", "~", "^", "\\"]
         df = DataFrame(data=special_characters)

From dfa546eeb9a95fb5dee1e95009c042364287fdc1 Mon Sep 17 00:00:00 2001
From: Thomas Smith
Date: Fri, 7 Aug 2020 17:56:12 +0100
Subject: [PATCH 03/51] BUG: GroupBy.apply() returns different results if a
 different GroupBy method is called first (#35314)

---
 doc/source/whatsnew/v1.2.0.rst               |  2 +-
 pandas/core/groupby/groupby.py               | 89 ++++++++++----------
 pandas/tests/groupby/aggregate/test_other.py |  4 +-
 pandas/tests/groupby/test_apply.py           | 29 +++++++
 pandas/tests/groupby/test_grouping.py        |  8 +-
 5 files changed, 82 insertions(+), 50 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 260b92b5989c1..5a4fa3150e344 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -156,7 +156,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`)
 -
 -
--
+- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`)

 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index ed512710295d7..4597afeeaddbf 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -736,13 +736,12 @@ def pipe(self, func, *args, **kwargs):
     def _make_wrapper(self, name):
         assert name in self._apply_allowlist

-        self._set_group_selection()
-
-        # need to setup the selection
-        # as are not passed directly but in the grouper
-        f = getattr(self._obj_with_exclusions, name)
-        if not isinstance(f, types.MethodType):
-            return self.apply(lambda self: getattr(self, name))
+        with _group_selection_context(self):
+            # need to setup the selection
+            # as are not passed directly but in the grouper
+            f = getattr(self._obj_with_exclusions, name)
+            if not isinstance(f, types.MethodType):
+                return self.apply(lambda self: getattr(self, name))

         f = getattr(type(self._obj_with_exclusions), name)
         sig = inspect.signature(f)
@@ -992,28 +991,28 @@ def _agg_general(
         alias: str,
         npfunc: Callable,
     ):
-        self._set_group_selection()
-
-        # try a cython aggregation if we can
-        try:
-            return self._cython_agg_general(
-                how=alias, alt=npfunc, numeric_only=numeric_only, min_count=min_count,
-            )
-        except DataError:
-            pass
-        except NotImplementedError as err:
-            if "function is not implemented for this dtype" in str(
-                err
-            ) or "category dtype not supported" in str(err):
-                # raised in _get_cython_function, in some cases can
-                # be trimmed by implementing cython funcs for more dtypes
+        with _group_selection_context(self):
+            # try a cython aggregation if we can
+            try:
+                return self._cython_agg_general(
+                    how=alias,
+                    alt=npfunc,
+                    numeric_only=numeric_only,
+                    min_count=min_count,
+                )
+            except DataError:
                 pass
-            else:
-                raise
-
-        # apply a non-cython aggregation
-        result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
-        return result
+            except NotImplementedError as err:
+                if "function is not implemented for this dtype" in str(
+                    err
+                ) or "category dtype not supported" in str(err):
+                    # raised in _get_cython_function, in some cases can
+                    # be trimmed by implementing cython funcs for more dtypes
+                    pass
+
+            # apply a non-cython aggregation
+            result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
+            return result

     def _cython_agg_general(
         self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
@@ -1940,29 +1939,31 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame:
         nth_values = list(set(n))

         nth_array = np.array(nth_values, dtype=np.intp)
-        self._set_group_selection()
+        with _group_selection_context(self):

-        mask_left = np.in1d(self._cumcount_array(), nth_array)
-        mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array)
-        mask = mask_left | mask_right
+            mask_left = np.in1d(self._cumcount_array(), nth_array)
+            mask_right = np.in1d(
+                self._cumcount_array(ascending=False) + 1, -nth_array
+            )
+            mask = mask_left | mask_right

-        ids, _, _ = self.grouper.group_info
+            ids, _, _ = self.grouper.group_info

-        # Drop NA values in grouping
-        mask = mask & (ids != -1)
+            # Drop NA values in grouping
+            mask = mask & (ids != -1)

-        out = self._selected_obj[mask]
-        if not self.as_index:
-            return out
+            out = self._selected_obj[mask]
+            if not self.as_index:
+                return out

-        result_index = self.grouper.result_index
-        out.index = result_index[ids[mask]]
+            result_index = self.grouper.result_index
+            out.index = result_index[ids[mask]]

-        if not self.observed and isinstance(result_index, CategoricalIndex):
-            out = out.reindex(result_index)
+            if not self.observed and isinstance(result_index, CategoricalIndex):
+                out = out.reindex(result_index)

-        out = self._reindex_output(out)
-        return out.sort_index() if self.sort else out
+            out = self._reindex_output(out)
+            return out.sort_index() if self.sort else out

         # dropna is truthy
         if isinstance(n, valid_containers):
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index 264cf40dc6984..e8cd6017a117c 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -486,13 +486,13 @@ def test_agg_timezone_round_trip():
     assert ts == grouped.first()["B"].iloc[0]

     # GH#27110 applying iloc should return a DataFrame
-    assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0]
+    assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]

     ts = df["B"].iloc[2]
     assert ts == grouped.last()["B"].iloc[0]

     # GH#27110 applying iloc should return a DataFrame
-    assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0]
+    assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]


 def test_sum_uint64_overflow():
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 665cd12225ad7..ee38722ffb8ce 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -1009,6 +1009,35 @@ def test_apply_with_timezones_aware():
     tm.assert_frame_equal(result1, result2)


+def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func):
+    # GH #34656
+    # GH #34271
+    df = DataFrame(
+        {
+            "a": [99, 99, 99, 88, 88, 88],
+            "b": [1, 2, 3, 4, 5, 6],
+            "c": [10, 20, 30, 40, 50, 60],
+        }
+    )
+
+    expected = pd.DataFrame(
+        {"a": [264, 297], "b": [15, 6], "c": [150, 60]},
+        index=pd.Index([88, 99], name="a"),
+    )
+
+    # Check output when no other methods are called before .apply()
+    grp = df.groupby(by="a")
+    result = grp.apply(sum)
+    tm.assert_frame_equal(result, expected)
+
+    # Check output when another method is called before .apply()
+    grp = df.groupby(by="a")
+    args = {"nth": [0], "corrwith": [df]}.get(reduction_func, [])
+    _ = getattr(grp, reduction_func)(*args)
+    result = grp.apply(sum)
+    tm.assert_frame_equal(result, expected)
+
+
 def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
     # GH 29617

diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index efcd22f9c0c82..40b4ce46e550b 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -191,13 +191,15 @@ def test_grouper_creation_bug(self):
         result = g.sum()
         tm.assert_frame_equal(result, expected)

-        result = g.apply(lambda x: x.sum())
-        tm.assert_frame_equal(result, expected)
-
         g = df.groupby(pd.Grouper(key="A", axis=0))
         result = g.sum()
         tm.assert_frame_equal(result, expected)

+        result = g.apply(lambda x: x.sum())
+        expected["A"] = [0, 2, 4]
+        expected = expected.loc[:, ["A", "B"]]
+        tm.assert_frame_equal(result, expected)
+
         # GH14334
         # pd.Grouper(key=...) may be passed in a list
         df = DataFrame(
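A rough sketch of the regression the new test above guards against (hypothetical data; the behavior shown is after the fix):

    import pandas as pd

    df = pd.DataFrame({"a": [99, 99, 88], "b": [1, 2, 3]})
    grp = df.groupby(by="a")
    _ = grp.sum()            # calling another groupby method first...
    result = grp.apply(sum)  # ...no longer changes what apply() returns
    print(list(result.columns))  # ['a', 'b'] with or without the earlier call
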
From 9843926e39dee88f689afc51cbf0f2fdc232dcd3 Mon Sep 17 00:00:00 2001
From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com>
Date: Fri, 7 Aug 2020 12:58:23 -0400
Subject: [PATCH 04/51] CLN: clarify TypeError for IndexSlice argument to
 pd.xs (#35411)

---
 doc/source/whatsnew/v1.2.0.rst              |  2 +-
 pandas/core/generic.py                      |  5 ++++-
 pandas/tests/indexing/multiindex/test_xs.py | 23 ++++++++++++++++++++-
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 5a4fa3150e344..15616f4a6f27c 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -134,7 +134,7 @@ Missing

 MultiIndex
 ^^^^^^^^^^
--
+- Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message `Expected label or tuple of labels` (:issue:`35301`)
 -

 I/O
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 6fd55c58ece40..843b602a12823 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3492,7 +3492,10 @@ class animal locomotion

         index = self.index
         if isinstance(index, MultiIndex):
-            loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
+            try:
+                loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
+            except TypeError as e:
+                raise TypeError(f"Expected label or tuple of labels, got {key}") from e
         else:
             loc = self.index.get_loc(key)

diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py
index b807795b9c309..91be1d913001b 100644
--- a/pandas/tests/indexing/multiindex/test_xs.py
+++ b/pandas/tests/indexing/multiindex/test_xs.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest

-from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range
+from pandas import DataFrame, Index, IndexSlice, MultiIndex, Series, concat, date_range
 import pandas._testing as tm
 import pandas.core.common as com

@@ -220,6 +220,27 @@ def test_xs_level_series_slice_not_implemented(
         s[2000, 3:4]


+def test_xs_IndexSlice_argument_not_implemented():
+    # GH 35301
+
+    index = MultiIndex(
+        levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]],
+        codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
+    )
+
+    series = Series(np.random.randn(6), index=index)
+    frame = DataFrame(np.random.randn(6, 4), index=index)
+
+    msg = (
+        "Expected label or tuple of labels, got "
+        r"\(\('foo', 'qux', 0\), slice\(None, None, None\)\)"
+    )
+    with pytest.raises(TypeError, match=msg):
+        frame.xs(IndexSlice[("foo", "qux", 0), :])
+    with pytest.raises(TypeError, match=msg):
+        series.xs(IndexSlice[("foo", "qux", 0), :])
+
+
 def test_series_getitem_multiindex_xs():
     # GH6258
     dt = list(date_range("20130903", periods=3))

From 1922ec414d50f88fb1db7d219159222f4dc89674 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Fri, 7 Aug 2020 11:09:30 -0700
Subject: [PATCH 05/51] BUG: Ensure rolling groupby doesn't segfault with
 center=True (#35562)

---
 doc/source/whatsnew/v1.1.1.rst      |  1 +
 pandas/core/window/indexers.py      |  6 +++
 pandas/tests/window/test_grouper.py | 65 +++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+)

diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst
index 6b315e0a9d016..a044a4aab284e 100644
--- a/doc/source/whatsnew/v1.1.1.rst
+++ b/doc/source/whatsnew/v1.1.1.rst
@@ -18,6 +18,7 @@ Fixed regressions
 - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`).
 - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`)
 - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`)
+- Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`)

 .. ---------------------------------------------------------------------------

diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py
index 0898836ed2e0e..bc36bdca982e8 100644
--- a/pandas/core/window/indexers.py
+++ b/pandas/core/window/indexers.py
@@ -319,4 +319,10 @@ def get_window_bounds(
             end_arrays.append(window_indicies.take(end))
         start = np.concatenate(start_arrays)
         end = np.concatenate(end_arrays)
+        # GH 35552: Need to adjust start and end based on the nans appended to values
+        # when center=True
+        if num_values > len(start):
+            offset = num_values - len(start)
+            start = np.concatenate([start, np.array([end[-1]] * offset)])
+            end = np.concatenate([end, np.array([end[-1]] * offset)])
         return start, end
diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py
index ca5a9eccea4f5..5241b9548a442 100644
--- a/pandas/tests/window/test_grouper.py
+++ b/pandas/tests/window/test_grouper.py
@@ -215,6 +215,71 @@ def foo(x):
         )
         tm.assert_series_equal(result, expected)

+    def test_groupby_rolling_center_center(self):
+        # GH 35552
+        series = Series(range(1, 6))
+        result = series.groupby(series).rolling(center=True, window=3).mean()
+        expected = Series(
+            [np.nan] * 5,
+            index=pd.MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3), (5, 4))),
+        )
+        tm.assert_series_equal(result, expected)
+
+        series = Series(range(1, 5))
+        result = series.groupby(series).rolling(center=True, window=3).mean()
+        expected = Series(
+            [np.nan] * 4,
+            index=pd.MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3))),
+        )
+        tm.assert_series_equal(result, expected)
+
+        df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 6, "b": range(11)})
+        result = df.groupby("a").rolling(center=True, window=3).mean()
+        expected = pd.DataFrame(
+            [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, 9, np.nan],
+            index=pd.MultiIndex.from_tuples(
+                (
+                    ("a", 0),
+                    ("a", 1),
+                    ("a", 2),
+                    ("a", 3),
+                    ("a", 4),
+                    ("b", 5),
+                    ("b", 6),
+                    ("b", 7),
+                    ("b", 8),
+                    ("b", 9),
+                    ("b", 10),
+                ),
+                names=["a", None],
+            ),
+            columns=["b"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+        df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 5, "b": range(10)})
+        result = df.groupby("a").rolling(center=True, window=3).mean()
+        expected = pd.DataFrame(
+            [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, np.nan],
+            index=pd.MultiIndex.from_tuples(
+                (
+                    ("a", 0),
+                    ("a", 1),
+                    ("a", 2),
+                    ("a", 3),
+                    ("a", 4),
+                    ("b", 5),
+                    ("b", 6),
+                    ("b", 7),
+                    ("b", 8),
+                    ("b", 9),
+                ),
+                names=["a", None],
+            ),
+            columns=["b"],
+        )
+        tm.assert_frame_equal(result, expected)
+
     def test_groupby_subselect_rolling(self):
         # GH 35486
         df = DataFrame(
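A minimal reproducer for the segfault this patch fixes, taken almost verbatim from the new test (safe to run once the fix is in):

    import pandas as pd

    # An odd number of values per group combined with center=True used to
    # walk past the computed window bounds; it now returns NaN-padded means.
    series = pd.Series(range(1, 6))
    print(series.groupby(series).rolling(center=True, window=3).mean())
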
From 23dcab58412900954de7dee1ab1bbdf257c903c9 Mon Sep 17 00:00:00 2001
From: Deepak Pandey <40865954+freakypandit@users.noreply.github.com>
Date: Sat, 8 Aug 2020 00:46:39 +0530
Subject: [PATCH 06/51] DOC: Docstring updated for DataFrame.equals (#34508)

Co-authored-by: Joris Van den Bossche

---
 pandas/core/generic.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 843b602a12823..87f25f578c3c6 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1201,9 +1201,11 @@ def equals(self, other):
         This function allows two Series or DataFrames to be compared against
         each other to see if they have the same shape and elements. NaNs in
-        the same location are considered equal. The column headers do not
-        need to have the same type, but the elements within the columns must
-        be the same dtype.
+        the same location are considered equal.
+
+        The row/column index do not need to have the same type, as long
+        as the values are considered equal. Corresponding columns must be of
+        the same dtype.

         Parameters
         ----------
@@ -1232,13 +1234,6 @@ def equals(self, other):
         numpy.array_equal : Return True if two arrays have the same shape
             and elements, False otherwise.

-        Notes
-        -----
-        This function requires that the elements have the same dtype as their
-        respective elements in the other Series or DataFrame. However, the
-        column labels do not need to have the same type, as long as they are
-        still considered equal.
-
         Examples
         --------
         >>> df = pd.DataFrame({1: [10], 2: [20]})

From 319a6d3f0d93c372bd53498168b957aaf5daa174 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 7 Aug 2020 12:35:19 -0700
Subject: [PATCH 07/51] REF: use unpack_zerodim_and_defer on EA methods
 (#34042)

---
 pandas/core/arrays/base.py            |  4 ++--
 pandas/core/arrays/boolean.py         | 19 ++++---------------
 pandas/core/arrays/numpy_.py          |  9 +++------
 pandas/core/arrays/string_.py         |  8 +++-----
 pandas/tests/extension/base/ops.py    | 15 +++++++++++----
 pandas/tests/extension/test_period.py |  6 +++++-
 6 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 2553a65aed07b..921927325a144 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -22,7 +22,7 @@
 from pandas.core.dtypes.cast import maybe_cast_to_extension_array
 from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype
 from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
 from pandas.core.dtypes.missing import isna

 from pandas.core import ops
@@ -1273,7 +1273,7 @@ def convert_values(param):
             ovalues = [param] * len(self)
             return ovalues

-        if isinstance(other, (ABCSeries, ABCIndexClass)):
+        if isinstance(other, (ABCSeries, ABCIndexClass, ABCDataFrame)):
             # rely on pandas to unbox and dispatch to us
             return NotImplemented

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
index dbce71b77a425..bd4bdc5ecb46f 100644
--- a/pandas/core/arrays/boolean.py
+++ b/pandas/core/arrays/boolean.py
@@ -20,7 +20,6 @@
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import register_extension_dtype
-from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
 from pandas.core.dtypes.missing import isna

 from pandas.core import ops
@@ -559,13 +558,10 @@ def all(self, skipna: bool = True, **kwargs):

     @classmethod
     def _create_logical_method(cls, op):
+        @ops.unpack_zerodim_and_defer(op.__name__)
         def logical_method(self, other):
-            if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
-                # Rely on pandas to unbox and dispatch to us.
-                return NotImplemented

             assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
-            other = lib.item_from_zerodim(other)
             other_is_booleanarray = isinstance(other, BooleanArray)
             other_is_scalar = lib.is_scalar(other)
             mask = None
@@ -605,16 +601,14 @@ def logical_method(self, other):

     @classmethod
     def _create_comparison_method(cls, op):
+        @ops.unpack_zerodim_and_defer(op.__name__)
         def cmp_method(self, other):
             from pandas.arrays import IntegerArray

-            if isinstance(
-                other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray)
-            ):
+            if isinstance(other, IntegerArray):
                 # Rely on pandas to unbox and dispatch to us.
                 return NotImplemented

-            other = lib.item_from_zerodim(other)
             mask = None

@@ -693,13 +687,8 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
     def _create_arithmetic_method(cls, op):
         op_name = op.__name__

+        @ops.unpack_zerodim_and_defer(op_name)
         def boolean_arithmetic_method(self, other):
-
-            if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
-                # Rely on pandas to unbox and dispatch to us.
-                return NotImplemented
-
-            other = lib.item_from_zerodim(other)
             mask = None

             if isinstance(other, BooleanArray):
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index f6dfb1f0f1e62..05f901518d82f 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -11,12 +11,11 @@
 from pandas.util._validators import validate_fillna_kwargs

 from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
 from pandas.core.dtypes.inference import is_array_like
 from pandas.core.dtypes.missing import isna

 from pandas import compat
-from pandas.core import nanops
+from pandas.core import nanops, ops
 from pandas.core.algorithms import searchsorted
 from pandas.core.array_algos import masked_reductions
 from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
@@ -436,11 +435,9 @@ def __invert__(self):

     @classmethod
     def _create_arithmetic_method(cls, op):
+        @ops.unpack_zerodim_and_defer(op.__name__)
         def arithmetic_method(self, other):
-            if isinstance(other, (ABCIndexClass, ABCSeries)):
-                return NotImplemented
-
-            elif isinstance(other, cls):
+            if isinstance(other, cls):
                 other = other._ndarray

             with np.errstate(all="ignore"):
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index fddd3af858f77..bb55c3cdea45c 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -7,7 +7,6 @@

 from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype
 from pandas.core.dtypes.common import pandas_dtype
-from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
 from pandas.core.dtypes.inference import is_array_like

 from pandas import compat
@@ -312,15 +311,14 @@ def memory_usage(self, deep=False):
     @classmethod
     def _create_arithmetic_method(cls, op):
         # Note: this handles both arithmetic and comparison methods.
+
+        @ops.unpack_zerodim_and_defer(op.__name__)
         def method(self, other):
             from pandas.arrays import BooleanArray

             assert op.__name__ in ops.ARITHMETIC_BINOPS | ops.COMPARISON_BINOPS

-            if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)):
-                return NotImplemented
-
-            elif isinstance(other, cls):
+            if isinstance(other, cls):
                 other = other._ndarray

             mask = isna(self) | isna(other)
diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py
index 359acf230ce14..c93603398977e 100644
--- a/pandas/tests/extension/base/ops.py
+++ b/pandas/tests/extension/base/ops.py
@@ -114,10 +114,13 @@ def test_error(self, data, all_arithmetic_operators):
         with pytest.raises(AttributeError):
             getattr(data, op_name)

-    def test_direct_arith_with_series_returns_not_implemented(self, data):
-        # EAs should return NotImplemented for ops with Series.
+    @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame])
+    def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box):
+        # EAs should return NotImplemented for ops with Series/DataFrame
         # Pandas takes care of unboxing the series and calling the EA's op.
         other = pd.Series(data)
+        if box is pd.DataFrame:
+            other = other.to_frame()
         if hasattr(data, "__add__"):
             result = data.__add__(other)
             assert result is NotImplemented
@@ -156,10 +159,14 @@ def test_compare_array(self, data, all_compare_operators):
         other = pd.Series([data[0]] * len(data))
         self._compare_other(s, data, op_name, other)

-    def test_direct_arith_with_series_returns_not_implemented(self, data):
-        # EAs should return NotImplemented for ops with Series.
+    @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame])
+    def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box):
+        # EAs should return NotImplemented for ops with Series/DataFrame
         # Pandas takes care of unboxing the series and calling the EA's op.
         other = pd.Series(data)
+        if box is pd.DataFrame:
+            other = other.to_frame()
+
         if hasattr(data, "__eq__"):
             result = data.__eq__(other)
             assert result is NotImplemented
diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py
index b1eb276bfc227..817881e00fa99 100644
--- a/pandas/tests/extension/test_period.py
+++ b/pandas/tests/extension/test_period.py
@@ -126,9 +126,13 @@ def test_add_series_with_extension_array(self, data):
     def test_error(self):
         pass

-    def test_direct_arith_with_series_returns_not_implemented(self, data):
+    @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame])
+    def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box):
         # Override to use __sub__ instead of __add__
         other = pd.Series(data)
+        if box is pd.DataFrame:
+            other = other.to_frame()
+
         result = data.__sub__(other)
         assert result is NotImplemented
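What the refactor standardizes, in sketch form (IntegerArray is just one ExtensionArray the decorator now covers; the calls below assume the patch is applied):

    import pandas as pd

    arr = pd.array([1, 2, 3])          # an IntegerArray
    df = pd.DataFrame({"a": [1, 2, 3]})
    # The array defers to pandas for Series/DataFrame operands rather than
    # operating on them directly, so alignment stays in pandas' hands.
    print(arr.__add__(df))             # NotImplemented
    print((arr + df["a"]).tolist())    # [2, 4, 6]
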
From 067f86f659db3f7021957f935dd5d00bf6eec1aa Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 7 Aug 2020 14:03:00 -0700
Subject: [PATCH 08/51] PERF: BlockManager.equals blockwise (#35357)

---
 pandas/core/dtypes/missing.py     | 14 ++++++-
 pandas/core/internals/managers.py | 27 ++-------------
 pandas/core/internals/ops.py      | 61 ++++++++++++++++++++++---------
 3 files changed, 61 insertions(+), 41 deletions(-)

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 8551ce9f14e6c..f59bb31af2828 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -10,7 +10,7 @@
 from pandas._libs import lib
 import pandas._libs.missing as libmissing
 from pandas._libs.tslibs import NaT, iNaT
-from pandas._typing import DtypeObj
+from pandas._typing import ArrayLike, DtypeObj

 from pandas.core.dtypes.common import (
     DT64NS_DTYPE,
@@ -484,6 +484,18 @@ def _array_equivalent_object(left, right, strict_nan):
     return True


+def array_equals(left: ArrayLike, right: ArrayLike) -> bool:
+    """
+    ExtensionArray-compatible implementation of array_equivalent.
+    """
+    if not is_dtype_equal(left.dtype, right.dtype):
+        return False
+    elif isinstance(left, ABCExtensionArray):
+        return left.equals(right)
+    else:
+        return array_equivalent(left, right, dtype_equal=True)
+
+
 def _infer_fill_value(val):
     """
     infer the fill value for the nan/NaT from the provided
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 4693cc193c27c..4b85f92391dce 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -19,7 +19,6 @@
 from pandas.core.dtypes.common import (
     DT64NS_DTYPE,
     is_datetimelike_v_numeric,
-    is_dtype_equal,
     is_extension_array_dtype,
     is_list_like,
     is_numeric_v_string_like,
@@ -28,10 +27,9 @@
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
-from pandas.core.dtypes.missing import array_equivalent, isna
+from pandas.core.dtypes.missing import array_equals, isna

 import pandas.core.algorithms as algos
-from pandas.core.arrays import ExtensionArray
 from pandas.core.arrays.sparse import SparseDtype
 from pandas.core.base import PandasObject
 import pandas.core.common as com
@@ -49,7 +47,7 @@
     get_block_type,
     make_block,
 )
-from pandas.core.internals.ops import operate_blockwise
+from pandas.core.internals.ops import blockwise_all, operate_blockwise


 # TODO: flexible with index=None and/or items=None
@@ -1449,26 +1447,9 @@ def equals(self, other: "BlockManager") -> bool:
             return False
         left = self.blocks[0].values
         right = other.blocks[0].values
-        if not is_dtype_equal(left.dtype, right.dtype):
-            return False
-        elif isinstance(left, ExtensionArray):
-            return left.equals(right)
-        else:
-            return array_equivalent(left, right)
+        return array_equals(left, right)

-        for i in range(len(self.items)):
-            # Check column-wise, return False if any column doesn't match
-            left = self.iget_values(i)
-            right = other.iget_values(i)
-            if not is_dtype_equal(left.dtype, right.dtype):
-                return False
-            elif isinstance(left, ExtensionArray):
-                if not left.equals(right):
-                    return False
-            else:
-                if not array_equivalent(left, right, dtype_equal=True):
-                    return False
-        return True
+        return blockwise_all(self, other, array_equals)

     def unstack(self, unstacker, fill_value) -> "BlockManager":
         """
diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py
index 6eedf72726acb..ae4892c720d5b 100644
--- a/pandas/core/internals/ops.py
+++ b/pandas/core/internals/ops.py
@@ -1,4 +1,5 @@
-from typing import TYPE_CHECKING, List, Tuple
+from collections import namedtuple
+from typing import TYPE_CHECKING, Iterator, List, Tuple

 import numpy as np

@@ -9,13 +10,17 @@
     from pandas.core.internals.managers import BlockManager  # noqa:F401


-def operate_blockwise(
-    left: "BlockManager", right: "BlockManager", array_op
-) -> "BlockManager":
+BlockPairInfo = namedtuple(
+    "BlockPairInfo", ["lvals", "rvals", "locs", "left_ea", "right_ea", "rblk"],
+)
+
+
+def _iter_block_pairs(
+    left: "BlockManager", right: "BlockManager"
+) -> Iterator[BlockPairInfo]:
     # At this point we have already checked the parent DataFrames for
     # assert rframe._indexed_same(lframe)

-    res_blks: List["Block"] = []
     for n, blk in enumerate(left.blocks):
         locs = blk.mgr_locs
         blk_vals = blk.values
@@ -34,21 +39,32 @@
             right_ea = not isinstance(rblk.values, np.ndarray)

             lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
+            info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk)
+            yield info

-            res_values = array_op(lvals, rvals)
-            if left_ea and not right_ea and hasattr(res_values, "reshape"):
-                res_values = res_values.reshape(1, -1)
-            nbs = rblk._split_op_result(res_values)

-            # Assertions are disabled for performance, but should hold:
-            # if right_ea or left_ea:
-            #    assert len(nbs) == 1
-            # else:
-            #    assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape)

+def operate_blockwise(
+    left: "BlockManager", right: "BlockManager", array_op
+) -> "BlockManager":
+    # At this point we have already checked the parent DataFrames for
+    # assert rframe._indexed_same(lframe)
+
+    res_blks: List["Block"] = []
+    for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right):
+        res_values = array_op(lvals, rvals)
+        if left_ea and not right_ea and hasattr(res_values, "reshape"):
+            res_values = res_values.reshape(1, -1)
+        nbs = rblk._split_op_result(res_values)
+
+        # Assertions are disabled for performance, but should hold:
+        # if right_ea or left_ea:
+        #    assert len(nbs) == 1
+        # else:
+        #    assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape)

-            _reset_block_mgr_locs(nbs, locs)
+        _reset_block_mgr_locs(nbs, locs)

-            res_blks.extend(nbs)
+        res_blks.extend(nbs)

     # Assertions are disabled for performance, but should hold:
     #  slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array}
@@ -85,7 +101,7 @@ def _get_same_shape_values(
     # Require that the indexing into lvals be slice-like
     assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs

-    # TODO(EA2D): with 2D EAs pnly this first clause would be needed
+    # TODO(EA2D): with 2D EAs only this first clause would be needed
     if not (left_ea or right_ea):
         lvals = lvals[rblk.mgr_locs.indexer, :]
         assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape)
@@ -102,3 +118,14 @@ def _get_same_shape_values(
         rvals = rvals[0, :]

     return lvals, rvals
+
+
+def blockwise_all(left: "BlockManager", right: "BlockManager", op) -> bool:
+    """
+    Blockwise `all` reduction.
+    """
+    for info in _iter_block_pairs(left, right):
+        res = op(info.lvals, info.rvals)
+        if not res:
+            return False
+    return True
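The user-visible contract behind ``equals`` is unchanged by the blockwise rewrite; a small sketch of what the new ``blockwise_all`` short-circuiting must preserve:

    import numpy as np
    import pandas as pd

    left = pd.DataFrame({"a": [1, 2], "b": [np.nan, 4.0]})
    right = pd.DataFrame({"a": [1, 2], "b": [np.nan, 4.0]})
    # NaNs in matching positions still compare equal, and a dtype mismatch
    # in any one column is enough to return False early.
    print(left.equals(right))                       # True
    print(left.equals(right.astype({"a": float})))  # False
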
""" - if maybe_use_numba(engine): numba_func, cache_key = generate_numba_func( func, engine_kwargs, kwargs, "groupby_transform" @@ -535,24 +534,23 @@ def _transform_general( if isinstance(res, (ABCDataFrame, ABCSeries)): res = res._values - indexer = self._get_index(name) - ser = klass(res, indexer) - results.append(ser) + results.append(klass(res, index=group.index)) # check for empty "results" to avoid concat ValueError if results: from pandas.core.reshape.concat import concat - result = concat(results).sort_index() + concatenated = concat(results) + result = self._set_result_index_ordered(concatenated) else: result = self.obj._constructor(dtype=np.float64) - # we will only try to coerce the result type if # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) - dtype = self._selected_obj.dtype - if is_numeric_dtype(dtype): - result = maybe_downcast_to_dtype(result, dtype) + if is_numeric_dtype(result.dtype): + common_dtype = find_common_type([self._selected_obj.dtype, result.dtype]) + if common_dtype is result.dtype: + result = maybe_downcast_numeric(result, self._selected_obj.dtype) result.name = self._selected_obj.name result.index = self._selected_obj.index diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 1a525d306e9f5..adf62c4723526 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -162,6 +162,40 @@ def test_groupby_dropna_series_by(dropna, expected): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "dropna,df_expected,s_expected", + [ + pytest.param( + True, + pd.DataFrame({"B": [2, 2, 1]}), + pd.Series(data=[2, 2, 1], name="B"), + marks=pytest.mark.xfail(raises=ValueError), + ), + ( + False, + pd.DataFrame({"B": [2, 2, 1, 1]}), + pd.Series(data=[2, 2, 1, 1], name="B"), + ), + ], +) +def test_slice_groupby_then_transform(dropna, df_expected, s_expected): + # GH35014 + + df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) + gb = df.groupby("A", dropna=dropna) + + res = gb.transform(len) + tm.assert_frame_equal(res, df_expected) + + gb_slice = gb[["B"]] + res = gb_slice.transform(len) + tm.assert_frame_equal(res, df_expected) + + gb_slice = gb["B"] + res = gb["B"].transform(len) + tm.assert_series_equal(res, s_expected) + + @pytest.mark.parametrize( "dropna, tuples, outputs", [ From 16bd49d8a7e4e28acac3b64f1923bd07694bc963 Mon Sep 17 00:00:00 2001 From: attack68 <24256554+attack68@users.noreply.github.com> Date: Fri, 7 Aug 2020 23:35:21 +0200 Subject: [PATCH 10/51] BUG: fix styler cell_ids arg so that blank style is ignored on False (#35588) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/io/formats/style.py | 2 +- pandas/tests/io/formats/test_style.py | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index a044a4aab284e..ade88a6127014 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -27,6 +27,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`). 
From 16bd49d8a7e4e28acac3b64f1923bd07694bc963 Mon Sep 17 00:00:00 2001
From: attack68 <24256554+attack68@users.noreply.github.com>
Date: Fri, 7 Aug 2020 23:35:21 +0200
Subject: [PATCH 10/51] BUG: fix styler cell_ids arg so that blank style is
 ignored on False (#35588)

---
 doc/source/whatsnew/v1.1.1.rst        | 1 +
 pandas/io/formats/style.py            | 2 +-
 pandas/tests/io/formats/test_style.py | 6 ++++++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst
index a044a4aab284e..ade88a6127014 100644
--- a/doc/source/whatsnew/v1.1.1.rst
+++ b/doc/source/whatsnew/v1.1.1.rst
@@ -27,6 +27,7 @@ Fixed regressions

 Bug fixes
 ~~~~~~~~~
+- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`).

 Categorical
 ^^^^^^^^^^^
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index fd1efa2d1b668..584f42a6cab12 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -390,7 +390,7 @@ def format_attr(pair):
                     "is_visible": (c not in hidden_columns),
                 }
                 # only add an id if the cell has a style
-                if self.cell_ids or not (len(ctx[r, c]) == 1 and ctx[r, c][0] == ""):
+                if self.cell_ids or (r, c) in ctx:
                     row_dict["id"] = "_".join(cs[1:])
                 row_es.append(row_dict)
                 props = []
diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py
index 9c6910637fa7e..3ef5157655e78 100644
--- a/pandas/tests/io/formats/test_style.py
+++ b/pandas/tests/io/formats/test_style.py
@@ -1682,6 +1682,12 @@ def f(a, b, styler):
         result = styler.pipe((f, "styler"), a=1, b=2)
         assert result == (1, 2, styler)

+    def test_no_cell_ids(self):
+        # GH 35588
+        df = pd.DataFrame(data=[[0]])
+        s = Styler(df, uuid="_", cell_ids=False).render()
+        assert s.find('') != -1
+

 @td.skip_if_no_mpl
 class TestStylerMatplotlibDep:
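A sketch of what the one-line condition change restores (the exact rendered HTML varies by version, so only relative size is compared here):

    import pandas as pd
    from pandas.io.formats.style import Styler

    df = pd.DataFrame(data=[[0]])
    with_ids = Styler(df, uuid="_").render()
    without_ids = Styler(df, uuid="_", cell_ids=False).render()
    # With the fix, unstyled cells no longer receive id attributes, so the
    # cell_ids=False rendering is strictly smaller.
    print(len(without_ids) < len(with_ids))  # True
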
From 92bf41a4707a8386fe3055bcbc959954da240071 Mon Sep 17 00:00:00 2001
From: Douglas Hanley
Date: Fri, 7 Aug 2020 18:22:55 -0400
Subject: [PATCH 11/51] BUG: assign consensus name to index union in array
 case GH13475 (#35338)

---
 doc/source/whatsnew/v1.2.0.rst          |  1 +
 pandas/core/indexes/api.py              |  5 ++--
 pandas/tests/frame/test_constructors.py | 36 ++++++++++++++++++++++++
 pandas/tests/reshape/test_concat.py     | 37 +++++++++++++++++++++++++
 4 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 74ef5178eb004..33e70daa55e66 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -162,6 +162,7 @@ Reshaping

 - Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`)
+- Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`)
 -

 Sparse
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index 4c5a70f4088ee..30cc8cf480dcf 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -218,9 +218,8 @@ def conv(i):
             return result
     elif kind == "array":
         index = indexes[0]
-        for other in indexes[1:]:
-            if not index.equals(other):
-                return _unique_indices(indexes)
+        if not all(index.equals(other) for other in indexes[1:]):
+            index = _unique_indices(indexes)

         name = get_consensus_names(indexes)[0]
         if name != index.name:
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index b78bb1c492ef4..d0f774344a33d 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -1618,6 +1618,42 @@ def test_constructor_Series_differently_indexed(self):
         tm.assert_index_equal(df2.index, other_index)
         tm.assert_frame_equal(df2, exp2)

+    @pytest.mark.parametrize(
+        "name_in1,name_in2,name_in3,name_out",
+        [
+            ("idx", "idx", "idx", "idx"),
+            ("idx", "idx", None, "idx"),
+            ("idx", None, None, "idx"),
+            ("idx1", "idx2", None, None),
+            ("idx1", "idx1", "idx2", None),
+            ("idx1", "idx2", "idx3", None),
+            (None, None, None, None),
+        ],
+    )
+    def test_constructor_index_names(self, name_in1, name_in2, name_in3, name_out):
+        # GH13475
+        indices = [
+            pd.Index(["a", "b", "c"], name=name_in1),
+            pd.Index(["b", "c", "d"], name=name_in2),
+            pd.Index(["c", "d", "e"], name=name_in3),
+        ]
+        series = {
+            c: pd.Series([0, 1, 2], index=i) for i, c in zip(indices, ["x", "y", "z"])
+        }
+        result = pd.DataFrame(series)
+
+        exp_ind = pd.Index(["a", "b", "c", "d", "e"], name=name_out)
+        expected = pd.DataFrame(
+            {
+                "x": [0, 1, 2, np.nan, np.nan],
+                "y": [np.nan, 0, 1, 2, np.nan],
+                "z": [np.nan, np.nan, 0, 1, 2],
+            },
+            index=exp_ind,
+        )
+
+        tm.assert_frame_equal(result, expected)
+
     def test_constructor_manager_resize(self, float_frame):
         index = list(float_frame.index[:5])
         columns = list(float_frame.columns[:3])
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index 0159fabd04d59..38cf2cc2402a1 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -1279,6 +1279,43 @@ def test_concat_ignore_index(self, sort):

         tm.assert_frame_equal(v1, expected)

+    @pytest.mark.parametrize(
+        "name_in1,name_in2,name_in3,name_out",
+        [
+            ("idx", "idx", "idx", "idx"),
+            ("idx", "idx", None, "idx"),
+            ("idx", None, None, "idx"),
+            ("idx1", "idx2", None, None),
+            ("idx1", "idx1", "idx2", None),
+            ("idx1", "idx2", "idx3", None),
+            (None, None, None, None),
+        ],
+    )
+    def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out):
+        # GH13475
+        indices = [
+            pd.Index(["a", "b", "c"], name=name_in1),
+            pd.Index(["b", "c", "d"], name=name_in2),
+            pd.Index(["c", "d", "e"], name=name_in3),
+        ]
+        frames = [
+            pd.DataFrame({c: [0, 1, 2]}, index=i)
+            for i, c in zip(indices, ["x", "y", "z"])
+        ]
+        result = pd.concat(frames, axis=1)
+
+        exp_ind = pd.Index(["a", "b", "c", "d", "e"], name=name_out)
+        expected = pd.DataFrame(
+            {
+                "x": [0, 1, 2, np.nan, np.nan],
+                "y": [np.nan, 0, 1, 2, np.nan],
+                "z": [np.nan, np.nan, 0, 1, 2],
+            },
+            index=exp_ind,
+        )
+
+        tm.assert_frame_equal(result, expected)
+
     def test_concat_multiindex_with_keys(self):
         index = MultiIndex(
             levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
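The index-name rule this patch establishes, sketched from the new tests:

    import pandas as pd

    s1 = pd.Series([0, 1, 2], index=pd.Index(["a", "b", "c"], name="idx"))
    s2 = pd.Series([3, 4, 5], index=pd.Index(["b", "c", "d"], name="idx"))
    # Unioning differently-valued indexes now keeps the consensus name;
    # conflicting names (e.g. "idx1" vs "idx2") would yield None instead.
    print(pd.concat([s1, s2], axis=1).index.name)  # 'idx'
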
From 47c17cbac9255c2e9b784eae1a8a7d7dfee19b59 Mon Sep 17 00:00:00 2001
From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com>
Date: Fri, 7 Aug 2020 18:17:14 -0500
Subject: [PATCH 12/51] REGR: Fix conversion of mixed dtype DataFrame to numpy
 str (#35473)

* Handle str better

* Doc and test

* Make an elif

* Add back import
---
 doc/source/whatsnew/v1.1.1.rst    | 1 +
 pandas/core/frame.py              | 2 ++
 pandas/core/internals/managers.py | 3 +++
 pandas/tests/frame/test_api.py    | 7 +++++++
 4 files changed, 13 insertions(+)

diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst
index ade88a6127014..f0ad9d1ca3b0f 100644
--- a/doc/source/whatsnew/v1.1.1.rst
+++ b/doc/source/whatsnew/v1.1.1.rst
@@ -15,6 +15,7 @@ including other versions of pandas.

 Fixed regressions
 ~~~~~~~~~~~~~~~~~
+- Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`)
 - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`).
 - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`)
 - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index aabdac16e9a1a..b66b6b92336f2 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1371,6 +1371,8 @@ def to_numpy(
         result = self._mgr.as_array(
             transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value
         )
+        if result.dtype is not dtype:
+            result = np.array(result, dtype=dtype, copy=False)

         return result

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 4b85f92391dce..aa74d173d69b3 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -19,6 +19,7 @@
 from pandas.core.dtypes.common import (
     DT64NS_DTYPE,
     is_datetimelike_v_numeric,
+    is_dtype_equal,
     is_extension_array_dtype,
     is_list_like,
     is_numeric_v_string_like,
@@ -865,6 +866,8 @@ def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray:
             dtype = dtype.subtype
         elif is_extension_array_dtype(dtype):
             dtype = "object"
+        elif is_dtype_equal(dtype, str):
+            dtype = "object"

         result = np.empty(self.shape, dtype=dtype)

diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
index 2b79fc8cd3406..cc57a3970d18b 100644
--- a/pandas/tests/frame/test_api.py
+++ b/pandas/tests/frame/test_api.py
@@ -367,6 +367,13 @@ def test_to_numpy_copy(self):
         assert df.to_numpy(copy=False).base is arr
         assert df.to_numpy(copy=True).base is not arr

+    def test_to_numpy_mixed_dtype_to_str(self):
+        # https://github.com/pandas-dev/pandas/issues/35455
+        df = pd.DataFrame([[pd.Timestamp("2020-01-01 00:00:00"), 100.0]])
+        result = df.to_numpy(dtype=str)
+        expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str)
+        tm.assert_numpy_array_equal(result, expected)
+
     def test_swapaxes(self):
         df = DataFrame(np.random.randn(10, 5))
         tm.assert_frame_equal(df.T, df.swapaxes(0, 1))
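The regression this patch fixes, sketched from the new test:

    import pandas as pd

    df = pd.DataFrame([[pd.Timestamp("2020-01-01 00:00:00"), 100.0]])
    # Converting a mixed-dtype frame with dtype=str previously raised a
    # RuntimeError; the interleaved values are now stringified element-wise.
    print(df.to_numpy(dtype=str))
    # [['2020-01-01 00:00:00' '100.0']]
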
+3088,7 @@ def to_csv( .. versionchanged:: 1.2.0 - Compression is supported for non-binary file objects. + Compression is supported for binary file objects. quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` From a9cb64ad105f8f7051f9d77b66e1bfe7c09d386f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 8 Aug 2020 10:47:43 +0100 Subject: [PATCH 14/51] CI: Linux py36_locale failures with pytest DeprecationWarning (#35621) --- ci/deps/azure-36-locale.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index a9b9a5a47ccf5..3034ed3dc43af 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0 # https://github.com/pandas-dev/pandas/issues/35620 - pytest-xdist>=1.21 - pytest-asyncio - hypothesis>=3.58.0 From 188ce7301329e6791df4a71b87b28922e00a6bf8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 8 Aug 2020 07:51:39 -0700 Subject: [PATCH 15/51] REF: Avoid post-processing in blockwise op (#35356) --- pandas/core/groupby/generic.py | 97 ++++++++++++++++------------------ 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1fed193dba02c..53242c0332a8c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1029,11 +1029,36 @@ def _cython_agg_blocks( agg_blocks: List[Block] = [] new_items: List[np.ndarray] = [] deleted_items: List[np.ndarray] = [] - # Some object-dtype blocks might be split into List[Block[T], Block[U]] - split_items: List[np.ndarray] = [] - split_frames: List[DataFrame] = [] no_result = object() + + def cast_result_block(result, block: "Block", how: str) -> "Block": + # see if we can cast the block to the desired dtype + # this may not be the original dtype + assert not isinstance(result, DataFrame) + assert result is not no_result + + dtype = maybe_cast_result_dtype(block.dtype, how) + result = maybe_downcast_numeric(result, dtype) + + if block.is_extension and isinstance(result, np.ndarray): + # e.g. block.values was an IntegerArray + # (1, N) case can occur if block.values was Categorical + # and result is ndarray[object] + # TODO(EA2D): special casing not needed with 2D EAs + assert result.ndim == 1 or result.shape[0] == 1 + try: + # Cast back if feasible + result = type(block.values)._from_sequence( + result.ravel(), dtype=block.values.dtype + ) + except (ValueError, TypeError): + # reshape to be valid for non-Extension Block + result = result.reshape(1, -1) + + agg_block: Block = block.make_block(result) + return agg_block + for block in data.blocks: # Avoid inheriting result from earlier in the loop result = no_result @@ -1065,9 +1090,9 @@ def _cython_agg_blocks( # not try to add missing categories if grouping over multiple # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. See GH#34951 - s = get_groupby(obj, self.grouper, observed=True) + sgb = get_groupby(obj, self.grouper, observed=True) try: - result = s.aggregate(lambda x: alt(x, axis=self.axis)) + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) except TypeError: # we may have an exception in trying to aggregate # continue and exclude the block @@ -1081,54 +1106,26 @@ def _cython_agg_blocks( # about a single block input returning a single block output # is a lie. 
To keep the code-path for the typical non-split case # clean, we choose to clean up this mess later on. - split_items.append(locs) - split_frames.append(result) - continue - - assert len(result._mgr.blocks) == 1 - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - - assert not isinstance(result, DataFrame) - - if result is not no_result: - # see if we can cast the block to the desired dtype - # this may not be the original dtype - dtype = maybe_cast_result_dtype(block.dtype, how) - result = maybe_downcast_numeric(result, dtype) - - if block.is_extension and isinstance(result, np.ndarray): - # e.g. block.values was an IntegerArray - # (1, N) case can occur if block.values was Categorical - # and result is ndarray[object] - # TODO(EA2D): special casing not needed with 2D EAs - assert result.ndim == 1 or result.shape[0] == 1 - try: - # Cast back if feasible - result = type(block.values)._from_sequence( - result.ravel(), dtype=block.values.dtype - ) - except (ValueError, TypeError): - # reshape to be valid for non-Extension Block - result = result.reshape(1, -1) - - agg_block: Block = block.make_block(result) - - new_items.append(locs) - agg_blocks.append(agg_block) + assert len(locs) == result.shape[1] + for i, loc in enumerate(locs): + new_items.append(np.array([loc], dtype=locs.dtype)) + agg_block = result.iloc[:, [i]]._mgr.blocks[0] + agg_blocks.append(agg_block) + else: + result = result._mgr.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) + agg_block = cast_result_block(result, block, how) + new_items.append(locs) + agg_blocks.append(agg_block) + else: + agg_block = cast_result_block(result, block, how) + new_items.append(locs) + agg_blocks.append(agg_block) - if not (agg_blocks or split_frames): + if not agg_blocks: raise DataError("No numeric types to aggregate") - if split_items: - # Clean up the mess left over from split blocks. - for locs, result in zip(split_items, split_frames): - assert len(locs) == result.shape[1] - for i, loc in enumerate(locs): - new_items.append(np.array([loc], dtype=locs.dtype)) - agg_blocks.append(result.iloc[:, [i]]._mgr.blocks[0]) - # reset the locs in the blocks to correspond to our # current ordering indexer = np.concatenate(new_items) From 71a327c4856e05dd761f0643aaa46903626e909c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 8 Aug 2020 15:52:20 +0100 Subject: [PATCH 16/51] DOC: docstrings for __array_wrap__ (#35629) --- pandas/core/generic.py | 23 ++++++++++++++++++++++- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/period.py | 9 ++++++--- 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 834cd992f5650..fcb7e2a949205 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1772,7 +1772,28 @@ def empty(self) -> bool_t: def __array__(self, dtype=None) -> np.ndarray: return np.asarray(self._values, dtype=dtype) - def __array_wrap__(self, result, context=None): + def __array_wrap__( + self, + result: np.ndarray, + context: Optional[Tuple[Callable, Tuple[Any, ...], int]] = None, + ): + """ + Gets called after a ufunc and other functions. 
+
+        Parameters
+        ----------
+        result : np.ndarray
+            The result of the ufunc or other function called on the NumPy array
+            returned by __array__.
+        context : tuple of (func, tuple, int)
+            This parameter is returned by ufuncs as a 3-element tuple: (name of the
+            ufunc, arguments of the ufunc, domain of the ufunc), but is not set by
+            other numpy functions.
+
+        Notes
+        -----
+        Series implements __array_ufunc__ so this is not called for ufuncs on Series.
+        """
         result = lib.item_from_zerodim(result)
         if is_scalar(result):
             # e.g. we get here with np.ptp(series)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index bfdfbd35f27ad..bd75a064b483e 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -574,7 +574,7 @@ def __array__(self, dtype=None) -> np.ndarray:
 
     def __array_wrap__(self, result, context=None):
         """
-        Gets called after a ufunc.
+        Gets called after a ufunc and other functions.
         """
         result = lib.item_from_zerodim(result)
         if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1:
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 0ce057d6e764a..8ccdab21339df 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -116,7 +116,7 @@ def values(self):
 
     def __array_wrap__(self, result, context=None):
         """
-        Gets called after a ufunc.
+        Gets called after a ufunc and other functions.
         """
         result = lib.item_from_zerodim(result)
         if is_bool_dtype(result) or lib.is_scalar(result):
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index c7199e4a28a17..11334803d4583 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -345,10 +345,13 @@ def _int64index(self) -> Int64Index:
 
     def __array_wrap__(self, result, context=None):
         """
-        Gets called after a ufunc. Needs additional handling as
-        PeriodIndex stores internal data as int dtype
+        Gets called after a ufunc and other functions.
 
-        Replace this to __numpy_ufunc__ in future version
+        Needs additional handling as PeriodIndex stores internal data as int
+        dtype
+
+        Replace this with __numpy_ufunc__ in a future version and implement
+        __array_function__ for Indexes.
         """
         if isinstance(context, tuple) and len(context) > 0:
             func = context[0]

From aefae55e1960a718561ae0369e83605e3038f292 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Sat, 8 Aug 2020 18:49:24 +0100
Subject: [PATCH 17/51] TYP: update setup.cfg (#35628)

---
 setup.cfg | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 84c281b756395..e4c0b3dcf37ef 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -175,9 +175,6 @@ check_untyped_defs=False
 [mypy-pandas.core.groupby.ops]
 check_untyped_defs=False
 
-[mypy-pandas.core.indexes.base]
-check_untyped_defs=False
-
 [mypy-pandas.core.indexes.datetimes]
 check_untyped_defs=False
 
@@ -214,9 +211,6 @@ check_untyped_defs=False
 [mypy-pandas.core.window.common]
 check_untyped_defs=False
 
-[mypy-pandas.core.window.ewm]
-check_untyped_defs=False
-
 [mypy-pandas.core.window.expanding]
 check_untyped_defs=False
 

From 6875a050d13f811b34c0064837bb781aca1cffcb Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Mon, 10 Aug 2020 06:13:30 -0700
Subject: [PATCH 18/51] BUG: RollingGroupby with closed and column selection
 no longer raises ValueError (#35639)

---
 doc/source/whatsnew/v1.1.1.rst      |  4 +++
 pandas/core/window/common.py        |  2 +-
 pandas/core/window/rolling.py       | 10 ++----
 pandas/tests/window/test_grouper.py | 51 +++++++++++++++++++++++++++++
 4 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst
index f0ad9d1ca3b0f..7f5182e3eaa6f 100644
--- a/doc/source/whatsnew/v1.1.1.rst
+++ b/doc/source/whatsnew/v1.1.1.rst
@@ -51,6 +51,10 @@ Categorical
 -
 -
 
+**Groupby/resample/rolling**
+
+- Bug in :class:`pandas.core.groupby.RollingGroupby` where passing ``closed`` with column selection would raise a ``ValueError`` (:issue:`35549`)
+
 **Plotting**
 
 -
diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py
index 58e7841d4dde5..51a067427e867 100644
--- a/pandas/core/window/common.py
+++ b/pandas/core/window/common.py
@@ -52,7 +52,7 @@ def __init__(self, obj, *args, **kwargs):
         kwargs.pop("parent", None)
         groupby = kwargs.pop("groupby", None)
         if groupby is None:
-            groupby, obj = obj, obj.obj
+            groupby, obj = obj, obj._selected_obj
         self._groupby = groupby
         self._groupby.mutated = True
         self._groupby.grouper.mutated = True
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index a04d68a6d6745..7347d5686aabc 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -2212,7 +2212,7 @@ def _apply(
         # Cannot use _wrap_outputs because we calculate the result all at once
         # Compose MultiIndex result from grouping levels then rolling level
         # Aggregate the MultiIndex data as tuples then the level names
-        grouped_object_index = self._groupby._selected_obj.index
+        grouped_object_index = self.obj.index
         grouped_index_name = [grouped_object_index.name]
         groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings]
         result_index_names = groupby_keys + grouped_index_name
@@ -2236,10 +2236,6 @@ def _apply(
     def _constructor(self):
         return Rolling
 
-    @cache_readonly
-    def _selected_obj(self):
-        return self._groupby._selected_obj
-
     def _create_blocks(self, obj: FrameOrSeries):
         """
         Split data into blocks & return conformed data.
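An illustrative sketch of the behaviour this patch fixes, mirroring the new
``test_groupby_rolling_subset_with_closed`` test added below; the frame and
expected sums are taken from that test, and the snippet assumes a pandas
build with this fix applied:

    import pandas as pd

    df = pd.DataFrame(
        {
            "column1": range(6),
            "group": 3 * ["A", "B"],
            "date": [pd.Timestamp("2019-01-01")] * 6,
        }
    )

    # Before this fix, combining ``closed=`` with column selection on a
    # RollingGroupby raised a ValueError (GH 35549); it now computes the
    # left-closed rolling sum within each group.
    result = (
        df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum()
    )
    print(result)  # group A rows: NaN, 0.0, 2.0 -- group B rows: NaN, 1.0, 4.0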
@@ -2278,7 +2274,7 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]] if self.is_freq_type: rolling_indexer = VariableWindowIndexer - index_array = self._groupby._selected_obj.index.asi8 + index_array = self.obj.index.asi8 else: rolling_indexer = FixedWindowIndexer index_array = None @@ -2295,7 +2291,7 @@ def _gotitem(self, key, ndim, subset=None): # here so our index is carried thru to the selected obj # when we do the splitting for the groupby if self.on is not None: - self._groupby.obj = self._groupby.obj.set_index(self._on) + self.obj = self.obj.set_index(self._on) self.on = None return super()._gotitem(key, ndim, subset=subset) diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 5241b9548a442..e1dcac06c39cc 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -304,3 +304,54 @@ def test_groupby_subselect_rolling(self): name="b", ) tm.assert_series_equal(result, expected) + + def test_groupby_rolling_subset_with_closed(self): + # GH 35549 + df = pd.DataFrame( + { + "column1": range(6), + "column2": range(6), + "group": 3 * ["A", "B"], + "date": [pd.Timestamp("2019-01-01")] * 6, + } + ) + result = ( + df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum() + ) + expected = Series( + [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], + index=pd.MultiIndex.from_tuples( + [("A", pd.Timestamp("2019-01-01"))] * 3 + + [("B", pd.Timestamp("2019-01-01"))] * 3, + names=["group", "date"], + ), + name="column1", + ) + tm.assert_series_equal(result, expected) + + def test_groupby_subset_rolling_subset_with_closed(self): + # GH 35549 + df = pd.DataFrame( + { + "column1": range(6), + "column2": range(6), + "group": 3 * ["A", "B"], + "date": [pd.Timestamp("2019-01-01")] * 6, + } + ) + + result = ( + df.groupby("group")[["column1", "date"]] + .rolling("1D", on="date", closed="left")["column1"] + .sum() + ) + expected = Series( + [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], + index=pd.MultiIndex.from_tuples( + [("A", pd.Timestamp("2019-01-01"))] * 3 + + [("B", pd.Timestamp("2019-01-01"))] * 3, + names=["group", "date"], + ), + name="column1", + ) + tm.assert_series_equal(result, expected) From 5dee73b172f04d6118edf4337819fe592e154163 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 10 Aug 2020 14:18:53 +0100 Subject: [PATCH 19/51] DEPR: Deprecate inplace param in MultiIndex.set_codes and MultiIndex.set_levels (#35626) --- doc/source/user_guide/indexing.rst | 8 +-- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/conftest.py | 2 +- pandas/core/indexes/multi.py | 26 ++++++++- pandas/tests/frame/methods/test_sort_index.py | 4 +- pandas/tests/indexes/multi/test_compat.py | 6 +- pandas/tests/indexes/multi/test_duplicates.py | 3 +- .../tests/indexes/multi/test_equivalence.py | 6 +- pandas/tests/indexes/multi/test_get_set.py | 55 ++++++++++++++----- pandas/tests/indexes/multi/test_integrity.py | 3 +- .../tests/indexing/multiindex/test_sorted.py | 8 ++- pandas/tests/reshape/test_melt.py | 4 +- pandas/tests/test_multilevel.py | 4 +- 13 files changed, 94 insertions(+), 37 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 6843dd1eadc81..cac18f5bf39cd 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1532,12 +1532,8 @@ Setting metadata ~~~~~~~~~~~~~~~~ Indexes are "mostly immutable", but it is possible to set and change their -metadata, 
like the index ``name`` (or, for ``MultiIndex``, ``levels`` and
-``codes``).
-
-You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_codes``
-to set these attributes directly. They default to returning a copy; however,
-you can specify ``inplace=True`` to have the data change in place.
+``name`` attribute. You can use ``rename`` and ``set_names`` to set these
+attributes directly; they default to returning a copy.
 
 See :ref:`Advanced Indexing ` for usage of MultiIndexes.
 
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 33e70daa55e66..d3bccada09c29 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -47,7 +47,7 @@ Other enhancements
 
 Deprecations
 ~~~~~~~~~~~~
-
+- Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`)
 -
 -
 
diff --git a/pandas/conftest.py b/pandas/conftest.py
index e0adb37e7d2f5..c1925b4f5ca3b 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -359,7 +359,7 @@ def multiindex_year_month_day_dataframe_random_data():
     tdf = tm.makeTimeDataFrame(100)
     ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum()
     # use Int64Index, to make sure things work
-    ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels], inplace=True)
+    ymd.index = ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels])
     ymd.index.set_names(["year", "month", "day"], inplace=True)
     return ymd
 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index a6e8ec0707de7..13927dede5542 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -740,7 +740,7 @@ def _set_levels(
         self._tuples = None
         self._reset_cache()
 
-    def set_levels(self, levels, level=None, inplace=False, verify_integrity=True):
+    def set_levels(self, levels, level=None, inplace=None, verify_integrity=True):
         """
         Set new levels on MultiIndex. Defaults to returning new index.
 
@@ -752,6 +752,8 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True):
             Level(s) to set (None for all levels).
         inplace : bool
             If True, mutates in place.
+
+            .. deprecated:: 1.2.0
         verify_integrity : bool, default True
             If True, checks that levels and codes are compatible.
 
@@ -822,6 +824,15 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True):
         >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels
         FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]])
         """
+        if inplace is not None:
+            warnings.warn(
+                "inplace is deprecated and will be removed in a future version.",
+                FutureWarning,
+                stacklevel=2,
+            )
+        else:
+            inplace = False
+
         if is_list_like(levels) and not isinstance(levels, Index):
             levels = list(levels)
 
@@ -898,7 +909,7 @@ def _set_codes(
         self._tuples = None
         self._reset_cache()
 
-    def set_codes(self, codes, level=None, inplace=False, verify_integrity=True):
+    def set_codes(self, codes, level=None, inplace=None, verify_integrity=True):
         """
         Set new codes on MultiIndex. Defaults to returning new index.
 
@@ -914,6 +925,8 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True):
             Level(s) to set (None for all levels).
         inplace : bool
             If True, mutates in place.
+
+            .. deprecated:: 1.2.0
         verify_integrity : bool (default True)
             If True, checks that levels and codes are compatible.
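For downstream code, the migration path is the same as the ``conftest.py``
change above: reassign the returned index instead of mutating in place. A
minimal sketch (the index values are hypothetical; assumes this patch is
applied):

    import pandas as pd

    mi = pd.MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 1]], names=["x", "y"])

    # Deprecated: mutates in place and now emits a FutureWarning.
    mi2 = mi.copy()
    mi2.set_levels([["p", "q"], [1, 2]], inplace=True)

    # Preferred: set_levels returns a new MultiIndex, so reassign it.
    mi = mi.set_levels([["p", "q"], [1, 2]])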
@@ -958,6 +971,15 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): (1, 'two')], names=['foo', 'bar']) """ + if inplace is not None: + warnings.warn( + "inplace is deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + if level is not None and not is_list_like(level): if not is_list_like(codes): raise TypeError("Codes must be list-like") diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 5216c3be116e0..dcc33428d18a5 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -555,8 +555,8 @@ def test_sort_index_and_reconstruction(self): ), ) - df.columns.set_levels( - pd.to_datetime(df.columns.levels[1]), level=1, inplace=True + df.columns = df.columns.set_levels( + pd.to_datetime(df.columns.levels[1]), level=1 ) assert not df.columns.is_lexsorted() assert not df.columns.is_monotonic diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index d1f66af4a8e83..b2500efef9e03 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -84,7 +84,8 @@ def test_inplace_mutation_resets_values(): tm.assert_almost_equal(mi1.values, vals) # Inplace should kill _tuples - mi1.set_levels(levels2, inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi1.set_levels(levels2, inplace=True) tm.assert_almost_equal(mi1.values, vals2) # Make sure label setting works too @@ -103,7 +104,8 @@ def test_inplace_mutation_resets_values(): tm.assert_almost_equal(exp_values, new_values) # ...and again setting inplace should kill _tuples, etc - mi2.set_codes(codes2, inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi2.set_codes(codes2, inplace=True) tm.assert_almost_equal(mi2.values, new_values) diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index e48731b9c8099..9add4b478da47 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -91,7 +91,8 @@ def test_duplicate_multiindex_codes(): mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) msg = r"Level values must be unique: \[[AB', ]+\] on level 0" with pytest.raises(ValueError, match=msg): - mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True) @pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]]) diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index 063ede028add7..b48f09457b96c 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -192,10 +192,12 @@ def test_is_(): mi4 = mi3.view() # GH 17464 - Remove duplicate MultiIndex levels - mi4.set_levels([list(range(10)), list(range(10))], inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi4.set_levels([list(range(10)), list(range(10))], inplace=True) assert not mi4.is_(mi3) mi5 = mi.view() - mi5.set_levels(mi5.levels, inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi5.set_levels(mi5.levels, inplace=True) assert not mi5.is_(mi) diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 
8a3deca0236e4..b9132f429905d 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -93,7 +93,8 @@ def test_set_levels(idx): # level changing [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_levels(new_levels, inplace=True) assert inplace_return is None assert_matching(ind2.levels, new_levels) @@ -113,20 +114,23 @@ def test_set_levels(idx): # level changing specific level [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels[0], level=0, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_levels(new_levels[0], level=0, inplace=True) assert inplace_return is None assert_matching(ind2.levels, [new_levels[0], levels[1]]) assert_matching(idx.levels, levels) ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels[1], level=1, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_levels(new_levels[1], level=1, inplace=True) assert inplace_return is None assert_matching(ind2.levels, [levels[0], new_levels[1]]) assert_matching(idx.levels, levels) # level changing multiple levels [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels, level=[0, 1], inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_levels(new_levels, level=[0, 1], inplace=True) assert inplace_return is None assert_matching(ind2.levels, new_levels) assert_matching(idx.levels, levels) @@ -136,19 +140,23 @@ def test_set_levels(idx): original_index = idx.copy() for inplace in [True, False]: with pytest.raises(ValueError, match="^On"): - idx.set_levels(["c"], level=0, inplace=inplace) + with tm.assert_produces_warning(FutureWarning): + idx.set_levels(["c"], level=0, inplace=inplace) assert_matching(idx.levels, original_index.levels, check_dtype=True) with pytest.raises(ValueError, match="^On"): - idx.set_codes([0, 1, 2, 3, 4, 5], level=0, inplace=inplace) + with tm.assert_produces_warning(FutureWarning): + idx.set_codes([0, 1, 2, 3, 4, 5], level=0, inplace=inplace) assert_matching(idx.codes, original_index.codes, check_dtype=True) with pytest.raises(TypeError, match="^Levels"): - idx.set_levels("c", level=0, inplace=inplace) + with tm.assert_produces_warning(FutureWarning): + idx.set_levels("c", level=0, inplace=inplace) assert_matching(idx.levels, original_index.levels, check_dtype=True) with pytest.raises(TypeError, match="^Codes"): - idx.set_codes(1, level=0, inplace=inplace) + with tm.assert_produces_warning(FutureWarning): + idx.set_codes(1, level=0, inplace=inplace) assert_matching(idx.codes, original_index.codes, check_dtype=True) @@ -168,7 +176,8 @@ def test_set_codes(idx): # changing label w/ mutation ind2 = idx.copy() - inplace_return = ind2.set_codes(new_codes, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_codes(new_codes, inplace=True) assert inplace_return is None assert_matching(ind2.codes, new_codes) @@ -188,20 +197,23 @@ def test_set_codes(idx): # label changing specific level w/ mutation ind2 = idx.copy() - inplace_return = ind2.set_codes(new_codes[0], level=0, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_codes(new_codes[0], level=0, inplace=True) assert inplace_return is None assert_matching(ind2.codes, [new_codes[0], codes[1]]) assert_matching(idx.codes, codes) ind2 = idx.copy() - 
inplace_return = ind2.set_codes(new_codes[1], level=1, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_codes(new_codes[1], level=1, inplace=True) assert inplace_return is None assert_matching(ind2.codes, [codes[0], new_codes[1]]) assert_matching(idx.codes, codes) # codes changing multiple levels [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_codes(new_codes, level=[0, 1], inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_codes(new_codes, level=[0, 1], inplace=True) assert inplace_return is None assert_matching(ind2.codes, new_codes) assert_matching(idx.codes, codes) @@ -217,7 +229,8 @@ def test_set_codes(idx): # [w/ mutation] result = ind.copy() - result.set_codes(codes=new_codes, level=1, inplace=True) + with tm.assert_produces_warning(FutureWarning): + result.set_codes(codes=new_codes, level=1, inplace=True) assert result.equals(expected) @@ -329,3 +342,19 @@ def test_set_levels_with_iterable(): [expected_sizes, colors], names=["size", "color"] ) tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("inplace", [True, False]) +def test_set_codes_inplace_deprecated(idx, inplace): + new_codes = idx.codes[1][::-1] + + with tm.assert_produces_warning(FutureWarning): + idx.set_codes(codes=new_codes, level=1, inplace=inplace) + + +@pytest.mark.parametrize("inplace", [True, False]) +def test_set_levels_inplace_deprecated(idx, inplace): + new_level = idx.levels[1].copy() + + with tm.assert_produces_warning(FutureWarning): + idx.set_levels(levels=new_level, level=1, inplace=inplace) diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index fd150bb4d57a2..c776a33717ccd 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ b/pandas/tests/indexes/multi/test_integrity.py @@ -220,7 +220,8 @@ def test_metadata_immutable(idx): def test_level_setting_resets_attributes(): ind = pd.MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) assert ind.is_monotonic - ind.set_levels([["A", "B"], [1, 3, 2]], inplace=True) + with tm.assert_produces_warning(FutureWarning): + ind.set_levels([["A", "B"], [1, 3, 2]], inplace=True) # if this fails, probably didn't reset the cache correctly. 
assert not ind.is_monotonic diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 572cb9da405d1..bafe5068e1418 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -43,9 +43,13 @@ def test_frame_getitem_not_sorted2(self, key): df2 = df.set_index(["col1", "col2"]) df2_original = df2.copy() - return_value = df2.index.set_levels(["b", "d", "a"], level="col1", inplace=True) + with tm.assert_produces_warning(FutureWarning): + return_value = df2.index.set_levels( + ["b", "d", "a"], level="col1", inplace=True + ) assert return_value is None - return_value = df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True) + with tm.assert_produces_warning(FutureWarning): + return_value = df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True) assert return_value is None assert not df2.index.is_lexsorted() assert not df2.index.is_monotonic diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 2b75a1ec6ca6e..79879ef346f53 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -799,7 +799,7 @@ def test_invalid_separator(self): expected = expected.set_index(["id", "year"])[ ["X", "A2010", "A2011", "B2010", "A", "B"] ] - expected.index.set_levels([0, 1], level=0, inplace=True) + expected.index = expected.index.set_levels([0, 1], level=0) result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep) tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) @@ -861,7 +861,7 @@ def test_invalid_suffixtype(self): expected = pd.DataFrame(exp_data).astype({"year": "int"}) expected = expected.set_index(["id", "year"]) - expected.index.set_levels([0, 1], level=0, inplace=True) + expected.index = expected.index.set_levels([0, 1], level=0) result = wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 1ba73292dc0b4..724558bd49ea2 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -63,8 +63,8 @@ def setup_method(self, method): ).sum() # use Int64Index, to make sure things work - self.ymd.index.set_levels( - [lev.astype("i8") for lev in self.ymd.index.levels], inplace=True + self.ymd.index = self.ymd.index.set_levels( + [lev.astype("i8") for lev in self.ymd.index.levels] ) self.ymd.index.set_names(["year", "month", "day"], inplace=True) From 25601734b19ec19aee31054846386a88d9c179b3 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 10 Aug 2020 14:24:29 +0100 Subject: [PATCH 20/51] REF: Simplify Index.copy (#35592) --- pandas/core/dtypes/common.py | 28 +++++++++++++++++- pandas/core/indexes/base.py | 36 +++++++++++++++--------- pandas/core/indexes/range.py | 5 ++-- pandas/core/series.py | 4 +-- pandas/tests/dtypes/test_common.py | 10 +++++++ pandas/tests/indexes/common.py | 14 +++++++++ pandas/tests/indexes/multi/test_names.py | 7 +++++ 7 files changed, 84 insertions(+), 20 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 73109020b1b54..1e70ff90fcd44 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -9,7 +9,7 @@ from pandas._libs import Interval, Period, algos from pandas._libs.tslibs import conversion -from pandas._typing import ArrayLike, DtypeObj +from pandas._typing import ArrayLike, DtypeObj, Optional from 
pandas.core.dtypes.base import registry
 from pandas.core.dtypes.dtypes import (
 
@@ -1732,6 +1732,32 @@ def _validate_date_like_dtype(dtype) -> None:
         )
 
 
+def validate_all_hashable(*args, error_name: Optional[str] = None) -> None:
+    """
+    Return None if all args are hashable, else raise a TypeError.
+
+    Parameters
+    ----------
+    *args
+        Arguments to validate.
+    error_name : str, optional
+        The name to use in the error message, if one is raised.
+
+    Raises
+    ------
+    TypeError : If an argument is not hashable.
+
+    Returns
+    -------
+    None
+    """
+    if not all(is_hashable(arg) for arg in args):
+        if error_name:
+            raise TypeError(f"{error_name} must be a hashable type")
+        else:
+            raise TypeError("All elements must be hashable")
+
+
 def pandas_dtype(dtype) -> DtypeObj:
     """
     Convert input into a pandas only dtype object or a numpy dtype object.
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index bd75a064b483e..ecd3670e724a1 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -58,6 +58,7 @@
     is_timedelta64_dtype,
     is_unsigned_integer_dtype,
     pandas_dtype,
+    validate_all_hashable,
 )
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.generic import (
@@ -812,13 +813,11 @@ def copy(self, name=None, deep=False, dtype=None, names=None):
         In most cases, there should be no functional difference from using
         ``deep``, but if ``deep`` is passed it will attempt to deepcopy.
         """
+        name = self._validate_names(name=name, names=names, deep=deep)[0]
         if deep:
-            new_index = self._shallow_copy(self._data.copy())
+            new_index = self._shallow_copy(self._data.copy(), name=name)
         else:
-            new_index = self._shallow_copy()
-
-            names = self._validate_names(name=name, names=names, deep=deep)
-            new_index = new_index.set_names(names)
+            new_index = self._shallow_copy(name=name)
 
         if dtype:
             new_index = new_index.astype(dtype)
@@ -1186,7 +1185,7 @@ def name(self, value):
         maybe_extract_name(value, None, type(self))
         self._name = value
 
-    def _validate_names(self, name=None, names=None, deep: bool = False):
+    def _validate_names(self, name=None, names=None, deep: bool = False) -> List[Label]:
         """
         Handles the quirks of having a singular 'name' parameter for general
         Index and plural 'names' parameter for MultiIndex.
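A rough sketch of what the new helper and the simplified ``copy`` provide,
based on the tests added further down (``test_name2`` and
``test_validate_allhashable``); assumes this patch is applied:

    import pandas as pd
    from pandas.core.dtypes.common import validate_all_hashable

    idx = pd.Index([1, 2, 3], name="original")

    # Renaming through copy() still works and is now validated up front.
    assert idx.copy(name="mario").name == "mario"

    # Wrong number of names fails early:
    # ValueError: Length of new names must be 1, got 2
    # idx.copy(name=["mario", "luigi"])

    # Unhashable names are caught by the shared helper:
    # TypeError: Index.name must be a hashable type
    # idx.copy(name=[["mario"]])

    validate_all_hashable(1, "a")  # returns None; all arguments are hashable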
@@ -1196,15 +1195,25 @@ def _validate_names(self, name=None, names=None, deep: bool = False): if names is not None and name is not None: raise TypeError("Can only provide one of `names` and `name`") elif names is None and name is None: - return deepcopy(self.names) if deep else self.names + new_names = deepcopy(self.names) if deep else self.names elif names is not None: if not is_list_like(names): raise TypeError("Must pass list-like as `names`.") - return names + new_names = names + elif not is_list_like(name): + new_names = [name] else: - if not is_list_like(name): - return [name] - return name + new_names = name + + if len(new_names) != len(self.names): + raise ValueError( + f"Length of new names must be {len(self.names)}, got {len(new_names)}" + ) + + # All items in 'new_names' need to be hashable + validate_all_hashable(*new_names, error_name=f"{type(self).__name__}.name") + + return new_names def _get_names(self): return FrozenList((self.name,)) @@ -1232,9 +1241,8 @@ def _set_names(self, values, level=None): # GH 20527 # All items in 'name' need to be hashable: - for name in values: - if not is_hashable(name): - raise TypeError(f"{type(self).__name__}.name must be a hashable type") + validate_all_hashable(*values, error_name=f"{type(self).__name__}.name") + self._name = values[0] names = property(fset=_set_names, fget=_get_names) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index e9c4c301f4dca..3577a7aacc008 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -388,9 +388,8 @@ def _shallow_copy(self, values=None, name: Label = no_default): def copy(self, name=None, deep=False, dtype=None, names=None): self._validate_dtype(dtype) - new_index = self._shallow_copy() - names = self._validate_names(name=name, names=names, deep=deep) - new_index = new_index.set_names(names) + name = self._validate_names(name=name, names=names, deep=deep)[0] + new_index = self._shallow_copy(name=name) return new_index def _minmax(self, meth: str): diff --git a/pandas/core/series.py b/pandas/core/series.py index 9e70120f67969..93368ea1e515f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -54,6 +54,7 @@ is_list_like, is_object_dtype, is_scalar, + validate_all_hashable, ) from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.inference import is_hashable @@ -491,8 +492,7 @@ def name(self) -> Label: @name.setter def name(self, value: Label) -> None: - if not is_hashable(value): - raise TypeError("Series.name must be a hashable type") + validate_all_hashable(value, error_name=f"{type(self).__name__}.name") object.__setattr__(self, "_name", value) @property diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index ce12718e48d0d..a6c526fcb008a 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -746,3 +746,13 @@ def test_astype_object_preserves_datetime_na(from_type): result = astype_nansafe(arr, dtype="object") assert isna(result)[0] + + +def test_validate_allhashable(): + assert com.validate_all_hashable(1, "a") is None + + with pytest.raises(TypeError, match="All elements must be hashable"): + com.validate_all_hashable([]) + + with pytest.raises(TypeError, match="list must be a hashable type"): + com.validate_all_hashable([], error_name="list") diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 238ee8d304d05..98f7c0eadb4bb 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -270,6 
+270,20 @@ def test_copy_name(self, index): s3 = s1 * s2 assert s3.index.name == "mario" + def test_name2(self, index): + # gh-35592 + if isinstance(index, MultiIndex): + return + + assert index.copy(name="mario").name == "mario" + + with pytest.raises(ValueError, match="Length of new names must be 1, got 2"): + index.copy(name=["mario", "luigi"]) + + msg = f"{type(index).__name__}.name must be a hashable type" + with pytest.raises(TypeError, match=msg): + index.copy(name=[["mario"]]) + def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured # GH12309 diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 479b5ef0211a0..f38da7ad2ae1c 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -75,6 +75,13 @@ def test_copy_names(): assert multi_idx.names == ["MyName1", "MyName2"] assert multi_idx3.names == ["NewName1", "NewName2"] + # gh-35592 + with pytest.raises(ValueError, match="Length of new names must be 2, got 1"): + multi_idx.copy(names=["mario"]) + + with pytest.raises(TypeError, match="MultiIndex.name must be a hashable type"): + multi_idx.copy(names=[["mario"], ["luigi"]]) + def test_names(idx, index_names): From 9a8152c95c8c61a0cdf33c64b80bbc9f2f9d27b6 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 10 Aug 2020 23:30:12 +1000 Subject: [PATCH 21/51] =?UTF-8?q?BUG:=20Fix=20assert=5Fequal=20when=20chec?= =?UTF-8?q?k=5Fexact=3DTrue=20for=20non-numeric=20dtypes=20#3=E2=80=A6=20(?= =?UTF-8?q?#35522)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/_testing.py | 6 ++---- pandas/tests/util/test_assert_series_equal.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 7f5182e3eaa6f..e5860644fa371 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). 
+- Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) diff --git a/pandas/_testing.py b/pandas/_testing.py index a020fbff3553a..713f29466f097 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1339,10 +1339,8 @@ def assert_series_equal( else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - if check_exact: - if not is_numeric_dtype(left.dtype): - raise AssertionError("check_exact may only be used with numeric Series") - + if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): + # Only check exact if dtype is numeric assert_numpy_array_equal( left._values, right._values, diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 1284cc9d4f49b..a7b5aeac560e4 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -281,3 +281,18 @@ class MySeries(Series): with pytest.raises(AssertionError, match="Series classes are different"): tm.assert_series_equal(s3, s1, check_series_type=True) + + +def test_series_equal_exact_for_nonnumeric(): + # https://github.com/pandas-dev/pandas/issues/35446 + s1 = Series(["a", "b"]) + s2 = Series(["a", "b"]) + s3 = Series(["b", "a"]) + + tm.assert_series_equal(s1, s2, check_exact=True) + tm.assert_series_equal(s2, s1, check_exact=True) + + with pytest.raises(AssertionError): + tm.assert_series_equal(s1, s3, check_exact=True) + with pytest.raises(AssertionError): + tm.assert_series_equal(s3, s1, check_exact=True) From df1d440c4a0c583b06d17a885d4589b9edaa840b Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 10 Aug 2020 11:19:25 -0400 Subject: [PATCH 22/51] Storage options (#35381) --- doc/source/user_guide/io.rst | 61 +++++++++++++--- doc/source/whatsnew/v1.2.0.rst | 14 ++++ pandas/_typing.py | 3 + pandas/conftest.py | 22 ++++++ pandas/core/frame.py | 37 +++++++++- pandas/core/generic.py | 44 +++++++++++- pandas/core/series.py | 20 +++++- pandas/io/common.py | 24 +++++-- pandas/io/feather_format.py | 25 +++++-- pandas/io/formats/csvs.py | 9 ++- pandas/io/json/_json.py | 28 ++++++-- pandas/io/parquet.py | 48 +++++++++++-- pandas/io/parsers.py | 5 +- pandas/io/pickle.py | 34 +++++++-- pandas/io/sas/sas_xport.py | 2 +- pandas/io/stata.py | 53 +++++++++++--- pandas/tests/io/test_fsspec.py | 125 ++++++++++++++++++++++++++++++++- pandas/tests/io/test_s3.py | 2 +- 18 files changed, 502 insertions(+), 54 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index ab233f653061a..35403b5c8b66f 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1649,29 +1649,72 @@ options include: Specifying any of the above options will produce a ``ParserWarning`` unless the python engine is selected explicitly using ``engine='python'``. -Reading remote files -'''''''''''''''''''' +.. 
_io.remote: + +Reading/writing remote files +'''''''''''''''''''''''''''' -You can pass in a URL to a CSV file: +You can pass in a URL to read or write remote files to many of Pandas' IO +functions - the following example shows reading a CSV file: .. code-block:: python df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item', sep='\t') -S3 URLs are handled as well but require installing the `S3Fs +All URLs which are not local files or HTTP(s) are handled by +`fsspec`_, if installed, and its various filesystem implementations +(including Amazon S3, Google Cloud, SSH, FTP, webHDFS...). +Some of these implementations will require additional packages to be +installed, for example +S3 URLs require the `s3fs `_ library: .. code-block:: python - df = pd.read_csv('s3://pandas-test/tips.csv') + df = pd.read_json('s3://pandas-test/adatafile.json') + +When dealing with remote storage systems, you might need +extra configuration with environment variables or config files in +special locations. For example, to access data in your S3 bucket, +you will need to define credentials in one of the several ways listed in +the `S3Fs documentation +`_. The same is true +for several of the storage backends, and you should follow the links +at `fsimpl1`_ for implementations built into ``fsspec`` and `fsimpl2`_ +for those not included in the main ``fsspec`` +distribution. + +You can also pass parameters directly to the backend driver. For example, +if you do *not* have S3 credentials, you can still access public data by +specifying an anonymous connection, such as + +.. versionadded:: 1.2.0 + +.. code-block:: python + + pd.read_csv("s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013" + "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv", + storage_options={"anon": True}) + +``fsspec`` also allows complex URLs, for accessing data in compressed +archives, local caching of files, and more. To locally cache the above +example, you would modify the call to + +.. code-block:: python -If your S3 bucket requires credentials you will need to set them as environment -variables or in the ``~/.aws/credentials`` config file, refer to the `S3Fs -documentation on credentials -`_. + pd.read_csv("simplecache::s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/" + "SaKe2013-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv", + storage_options={"s3": {"anon": True}}) +where we specify that the "anon" parameter is meant for the "s3" part of +the implementation, not to the caching implementation. Note that this caches to a temporary +directory for the duration of the session only, but you can also specify +a permanent store. +.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/ +.. _fsimpl1: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations +.. _fsimpl2: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations Writing out data '''''''''''''''' diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d3bccada09c29..94bb265c32e4c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -13,6 +13,20 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +Passing arguments to fsspec backends +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Many read/write functions have acquired the ``storage_options`` optional argument, +to pass a dictionary of parameters to the storage backend. This allows, for +example, for passing credentials to S3 and GCS storage. 
The details of what +parameters can be passed to which backends can be found in the documentation +of the individual storage backends (detailed from the fsspec docs for +`builtin implementations`_ and linked to `external ones`_). See +Section :ref:`io.remote`. + +.. _builtin implementations: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations +.. _external ones: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations + .. _whatsnew_120.binary_handle_to_csv: Support for binary file handles in ``to_csv`` diff --git a/pandas/_typing.py b/pandas/_typing.py index 76ec527e6e258..47a102ddc70e0 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -106,3 +106,6 @@ List[AggFuncTypeBase], Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], ] + +# for arbitrary kwargs passed during reading/writing files +StorageOptions = Optional[Dict[str, Any]] diff --git a/pandas/conftest.py b/pandas/conftest.py index c1925b4f5ca3b..97cc514e31bb3 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1224,3 +1224,25 @@ def sort_by_key(request): Tests None (no key) and the identity key. """ return request.param + + +@pytest.fixture() +def fsspectest(): + pytest.importorskip("fsspec") + from fsspec import register_implementation + from fsspec.implementations.memory import MemoryFileSystem + from fsspec.registry import _registry as registry + + class TestMemoryFS(MemoryFileSystem): + protocol = "testmem" + test = [None] + + def __init__(self, **kwargs): + self.test[0] = kwargs.pop("test", None) + super().__init__(**kwargs) + + register_implementation("testmem", TestMemoryFS, clobber=True) + yield TestMemoryFS() + registry.pop("testmem", None) + TestMemoryFS.test[0] = None + TestMemoryFS.store.clear() diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b66b6b92336f2..9d0751fcce460 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -55,6 +55,7 @@ Label, Level, Renamer, + StorageOptions, ValueKeyFunc, ) from pandas.compat import PY37 @@ -2058,6 +2059,7 @@ def to_stata( version: Optional[int] = 114, convert_strl: Optional[Sequence[Label]] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: StorageOptions = None, ) -> None: """ Export DataFrame object to Stata dta format. @@ -2134,6 +2136,16 @@ def to_stata( .. versionadded:: 1.1.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. + + .. 
versionadded:: 1.2.0 + Raises ------ NotImplementedError @@ -2194,6 +2206,7 @@ def to_stata( write_index=write_index, variable_labels=variable_labels, compression=compression, + storage_options=storage_options, **kwargs, ) writer.write_file() @@ -2246,9 +2259,10 @@ def to_feather(self, path, **kwargs) -> None: ) def to_markdown( self, - buf: Optional[IO[str]] = None, - mode: Optional[str] = None, + buf: Optional[Union[IO[str], str]] = None, + mode: str = "wt", index: bool = True, + storage_options: StorageOptions = None, **kwargs, ) -> Optional[str]: if "showindex" in kwargs: @@ -2266,9 +2280,14 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode) + buf, _, _, should_close = get_filepath_or_buffer( + buf, mode=mode, storage_options=storage_options + ) assert buf is not None # Help mypy. + assert not isinstance(buf, str) buf.writelines(result) + if should_close: + buf.close() return None @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") @@ -2279,6 +2298,7 @@ def to_parquet( compression: Optional[str] = "snappy", index: Optional[bool] = None, partition_cols: Optional[List[str]] = None, + storage_options: StorageOptions = None, **kwargs, ) -> None: """ @@ -2327,6 +2347,16 @@ def to_parquet( .. versionadded:: 0.24.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + **kwargs Additional arguments passed to the parquet library. See :ref:`pandas io ` for more details. @@ -2373,6 +2403,7 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, + storage_options=storage_options, **kwargs, ) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fcb7e2a949205..520023050d49d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -40,6 +40,7 @@ Label, Level, Renamer, + StorageOptions, TimedeltaConvertibleTypes, TimestampConvertibleTypes, ValueKeyFunc, @@ -2058,6 +2059,7 @@ def to_json( compression: Optional[str] = "infer", index: bool_t = True, indent: Optional[int] = None, + storage_options: StorageOptions = None, ) -> Optional[str]: """ Convert the object to a JSON string. @@ -2141,6 +2143,16 @@ def to_json( .. versionadded:: 1.0.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + Returns ------- None or str @@ -2319,6 +2331,7 @@ def to_json( compression=compression, index=index, indent=indent, + storage_options=storage_options, ) def to_hdf( @@ -2633,6 +2646,7 @@ def to_pickle( path, compression: Optional[str] = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, + storage_options: StorageOptions = None, ) -> None: """ Pickle (serialize) object to file. @@ -2653,6 +2667,16 @@ def to_pickle( .. [1] https://docs.python.org/3/library/pickle.html. 
+ storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + See Also -------- read_pickle : Load pickled pandas object (or any object) from file. @@ -2686,7 +2710,13 @@ def to_pickle( """ from pandas.io.pickle import to_pickle - to_pickle(self, path, compression=compression, protocol=protocol) + to_pickle( + self, + path, + compression=compression, + protocol=protocol, + storage_options=storage_options, + ) def to_clipboard( self, excel: bool_t = True, sep: Optional[str] = None, **kwargs @@ -3031,6 +3061,7 @@ def to_csv( escapechar: Optional[str] = None, decimal: Optional[str] = ".", errors: str = "strict", + storage_options: StorageOptions = None, ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3142,6 +3173,16 @@ def to_csv( .. versionadded:: 1.1.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + Returns ------- None or str @@ -3194,6 +3235,7 @@ def to_csv( doublequote=doublequote, escapechar=escapechar, decimal=decimal, + storage_options=storage_options, ) formatter.save() diff --git a/pandas/core/series.py b/pandas/core/series.py index 93368ea1e515f..e8bf87a39b572 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -31,6 +31,7 @@ FrameOrSeriesUnion, IndexKeyFunc, Label, + StorageOptions, ValueKeyFunc, ) from pandas.compat.numpy import function as nv @@ -1422,8 +1423,9 @@ def to_string( def to_markdown( self, buf: Optional[IO[str]] = None, - mode: Optional[str] = None, + mode: str = "wt", index: bool = True, + storage_options: StorageOptions = None, **kwargs, ) -> Optional[str]: """ @@ -1436,12 +1438,22 @@ def to_markdown( buf : str, Path or StringIO-like, optional, default None Buffer to write to. If None, the output is returned as a string. mode : str, optional - Mode in which file is opened. + Mode in which file is opened, "wt" by default. index : bool, optional, default True Add index (row) labels. .. versionadded:: 1.1.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + **kwargs These parameters will be passed to `tabulate \ `_. 
@@ -1477,7 +1489,9 @@ def to_markdown( | 3 | quetzal | +----+----------+ """ - return self.to_frame().to_markdown(buf, mode, index, **kwargs) + return self.to_frame().to_markdown( + buf, mode, index, storage_options=storage_options, **kwargs + ) # ---------------------------------------------------------------------- diff --git a/pandas/io/common.py b/pandas/io/common.py index 34e4425c657f1..9ac642e58b544 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -29,7 +29,7 @@ ) import zipfile -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.compat import _get_lzma_file, _import_lzma from pandas.compat._optional import import_optional_dependency @@ -162,7 +162,7 @@ def get_filepath_or_buffer( encoding: Optional[str] = None, compression: Optional[str] = None, mode: Optional[str] = None, - storage_options: Optional[Dict[str, Any]] = None, + storage_options: StorageOptions = None, ): """ If the filepath_or_buffer is a url, translate and return the buffer. @@ -175,8 +175,16 @@ def get_filepath_or_buffer( compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional - storage_options: dict, optional - passed on to fsspec, if using it; this is not yet accessed by the public API + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 Returns ------- @@ -188,6 +196,10 @@ def get_filepath_or_buffer( if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged + if storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": @@ -242,6 +254,10 @@ def get_filepath_or_buffer( ).open() return file_obj, encoding, compression, True + elif storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index dfa43942fc8b3..2c664e73b9463 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -4,10 +4,10 @@ from pandas import DataFrame, Int64Index, RangeIndex -from pandas.io.common import get_filepath_or_buffer, stringify_path +from pandas.io.common import get_filepath_or_buffer -def to_feather(df: DataFrame, path, **kwargs): +def to_feather(df: DataFrame, path, storage_options=None, **kwargs): """ Write a DataFrame to the binary Feather format. @@ -15,6 +15,17 @@ def to_feather(df: DataFrame, path, **kwargs): ---------- df : DataFrame path : string file path, or file-like object + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". 
An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + **kwargs : Additional keywords passed to `pyarrow.feather.write_feather`. @@ -23,7 +34,9 @@ def to_feather(df: DataFrame, path, **kwargs): import_optional_dependency("pyarrow") from pyarrow import feather - path = stringify_path(path) + path, _, _, should_close = get_filepath_or_buffer( + path, mode="wb", storage_options=storage_options + ) if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -64,7 +77,7 @@ def to_feather(df: DataFrame, path, **kwargs): feather.write_feather(df, path, **kwargs) -def read_feather(path, columns=None, use_threads: bool = True): +def read_feather(path, columns=None, use_threads: bool = True, storage_options=None): """ Load a feather-format object from the file path. @@ -98,7 +111,9 @@ def read_feather(path, columns=None, use_threads: bool = True): import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close = get_filepath_or_buffer(path) + path, _, _, should_close = get_filepath_or_buffer( + path, storage_options=storage_options + ) df = feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index b10946a20d041..6eceb94387171 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -11,7 +11,7 @@ import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -53,6 +53,7 @@ def __init__( doublequote: bool = True, escapechar: Optional[str] = None, decimal=".", + storage_options: StorageOptions = None, ): self.obj = obj @@ -63,7 +64,11 @@ def __init__( compression, self.compression_args = get_compression_method(compression) self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, mode=mode + path_or_buf, + encoding=encoding, + compression=compression, + mode=mode, + storage_options=storage_options, ) self.sep = sep self.na_rep = na_rep diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 0b06a26d4aa3c..0d2b351926343 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -9,7 +9,7 @@ import pandas._libs.json as json from pandas._libs.tslibs import iNaT -from pandas._typing import JSONSerializable +from pandas._typing import JSONSerializable, StorageOptions from pandas.errors import AbstractMethodError from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments @@ -44,6 +44,7 @@ def to_json( compression: Optional[str] = "infer", index: bool = True, indent: int = 0, + storage_options: StorageOptions = None, ): if not index and orient not in ["split", "table"]: @@ -52,8 +53,11 @@ def to_json( ) if path_or_buf is not None: - path_or_buf, _, _, _ = get_filepath_or_buffer( - path_or_buf, compression=compression, mode="w" + path_or_buf, _, _, should_close = get_filepath_or_buffer( + path_or_buf, + compression=compression, + mode="wt", + storage_options=storage_options, ) if lines and orient != "records": @@ -97,6 +101,8 @@ def to_json( return s else: path_or_buf.write(s) + if should_close: + path_or_buf.close() class Writer: @@ -365,6 +371,7 @@ def read_json( chunksize: Optional[int] = 
None, compression="infer", nrows: Optional[int] = None, + storage_options: StorageOptions = None, ): """ Convert a JSON string to pandas object. @@ -510,6 +517,16 @@ def read_json( .. versionadded:: 1.1 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + Returns ------- Series or DataFrame @@ -592,7 +609,10 @@ def read_json( compression = infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression + path_or_buf, + encoding=encoding, + compression=compression, + storage_options=storage_options, ) json_reader = JsonReader( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 8c4b63767ac06..7f0eef039a1e8 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -3,7 +3,7 @@ from typing import Any, AnyStr, Dict, List, Optional from warnings import catch_warnings -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError @@ -89,6 +89,7 @@ def write( path: FilePathOrBuffer[AnyStr], compression: Optional[str] = "snappy", index: Optional[bool] = None, + storage_options: StorageOptions = None, partition_cols: Optional[List[str]] = None, **kwargs, ): @@ -105,9 +106,13 @@ def write( import_optional_dependency("fsspec") import fsspec.core - fs, path = fsspec.core.url_to_fs(path) + fs, path = fsspec.core.url_to_fs(path, **(storage_options or {})) kwargs["filesystem"] = fs else: + if storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) path = _expand_user(path) if partition_cols is not None: # writes to multiple files under the given path @@ -122,14 +127,20 @@ def write( # write to single output file self.api.parquet.write_table(table, path, compression=compression, **kwargs) - def read(self, path, columns=None, **kwargs): + def read( + self, path, columns=None, storage_options: StorageOptions = None, **kwargs, + ): if is_fsspec_url(path) and "filesystem" not in kwargs: import_optional_dependency("fsspec") import fsspec.core - fs, path = fsspec.core.url_to_fs(path) + fs, path = fsspec.core.url_to_fs(path, **(storage_options or {})) should_close = False else: + if storage_options: + raise ValueError( + "storage_options passed with buffer or non-fsspec filepath" + ) fs = kwargs.pop("filesystem", None) should_close = False path = _expand_user(path) @@ -163,6 +174,7 @@ def write( compression="snappy", index=None, partition_cols=None, + storage_options: StorageOptions = None, **kwargs, ): self.validate_dataframe(df) @@ -185,8 +197,14 @@ def write( fsspec = import_optional_dependency("fsspec") # if filesystem is provided by fsspec, file must be opened in 'wb' mode. 
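
Both parquet engines lean on fsspec for the URL handling: pyarrow is handed an
explicit filesystem object from ``fsspec.core.url_to_fs`` (with
``storage_options`` expanded into the filesystem constructor), while
fastparquet gets an ``open_with`` callable built on ``fsspec.open``. Roughly,
and for illustration only (``memory://`` standing in for a real remote store):

    import fsspec.core

    # url_to_fs splits a URL into a filesystem instance and a path within
    # that filesystem; keyword arguments become the storage options
    fs, path = fsspec.core.url_to_fs("memory://bucket/data.parquet")
    print(type(fs).__name__, path)  # MemoryFileSystem /bucket/data.parquet

The fastparquet write path below wires the same options into that callable:
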
- kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open() + kwargs["open_with"] = lambda path, _: fsspec.open( + path, "wb", **(storage_options or {}) + ).open() else: + if storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) path, _, _, _ = get_filepath_or_buffer(path) with catch_warnings(record=True): @@ -199,11 +217,15 @@ def write( **kwargs, ) - def read(self, path, columns=None, **kwargs): + def read( + self, path, columns=None, storage_options: StorageOptions = None, **kwargs, + ): if is_fsspec_url(path): fsspec = import_optional_dependency("fsspec") - open_with = lambda path, _: fsspec.open(path, "rb").open() + open_with = lambda path, _: fsspec.open( + path, "rb", **(storage_options or {}) + ).open() parquet_file = self.api.ParquetFile(path, open_with=open_with) else: path, _, _, _ = get_filepath_or_buffer(path) @@ -218,6 +240,7 @@ def to_parquet( engine: str = "auto", compression: Optional[str] = "snappy", index: Optional[bool] = None, + storage_options: StorageOptions = None, partition_cols: Optional[List[str]] = None, **kwargs, ): @@ -261,6 +284,16 @@ def to_parquet( .. versionadded:: 0.24.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + kwargs Additional keyword arguments passed to the engine """ @@ -273,6 +306,7 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, + storage_options=storage_options, **kwargs, ) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d4f346f8c1087..9dc0e1f71d13b 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -420,6 +420,7 @@ def _validate_names(names): def _read(filepath_or_buffer: FilePathOrBuffer, kwds): """Generic reader of line files.""" encoding = kwds.get("encoding", None) + storage_options = kwds.get("storage_options", None) if encoding is not None: encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding @@ -432,7 +433,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # though mypy handling of conditional imports is difficult. 
# See https://github.com/python/mypy/issues/1297 fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression + filepath_or_buffer, encoding, compression, storage_options=storage_options ) kwds["compression"] = compression @@ -595,6 +596,7 @@ def read_csv( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, + storage_options=None, ): # gh-23761 # @@ -681,6 +683,7 @@ def read_csv( mangle_dupe_cols=mangle_dupe_cols, infer_datetime_format=infer_datetime_format, skip_blank_lines=skip_blank_lines, + storage_options=storage_options, ) return _read(filepath_or_buffer, kwds) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 3b35b54a6dc16..549d55e65546d 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -3,7 +3,7 @@ from typing import Any, Optional import warnings -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.compat import pickle_compat as pc from pandas.io.common import get_filepath_or_buffer, get_handle @@ -14,6 +14,7 @@ def to_pickle( filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, + storage_options: StorageOptions = None, ): """ Pickle (serialize) object to file. @@ -42,6 +43,16 @@ def to_pickle( protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + .. [1] https://docs.python.org/3/library/pickle.html See Also @@ -76,7 +87,10 @@ def to_pickle( >>> os.remove("./dummy.pkl") """ fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, compression=compression, mode="wb" + filepath_or_buffer, + compression=compression, + mode="wb", + storage_options=storage_options, ) if not isinstance(fp_or_buf, str) and compression == "infer": compression = None @@ -97,7 +111,9 @@ def to_pickle( def read_pickle( - filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer" + filepath_or_buffer: FilePathOrBuffer, + compression: Optional[str] = "infer", + storage_options: StorageOptions = None, ): """ Load pickled pandas object (or any object) from file. @@ -121,6 +137,16 @@ def read_pickle( compression) If 'infer' and 'path_or_url' is not path-like, then use None (= no decompression). + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. 
versionadded:: 1.2.0 + Returns ------- unpickled : same type as object stored in file @@ -162,7 +188,7 @@ def read_pickle( >>> os.remove("./dummy.pkl") """ fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, compression=compression + filepath_or_buffer, compression=compression, storage_options=storage_options ) if not isinstance(fp_or_buf, str) and compression == "infer": compression = None diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 7fc1bc6d3eb6c..6cf248b748107 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -244,7 +244,7 @@ class XportReader(ReaderBase, abc.Iterator): __doc__ = _xport_reader_doc def __init__( - self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None + self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None, ): self._encoding = encoding diff --git a/pandas/io/stata.py b/pandas/io/stata.py index cb23b781a7ad2..7a25617885839 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -11,7 +11,7 @@ """ from collections import abc import datetime -from io import BytesIO, IOBase +from io import BytesIO import os from pathlib import Path import struct @@ -35,7 +35,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas._typing import FilePathOrBuffer, Label +from pandas._typing import FilePathOrBuffer, Label, StorageOptions from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -1035,6 +1035,7 @@ def __init__( columns: Optional[Sequence[str]] = None, order_categoricals: bool = True, chunksize: Optional[int] = None, + storage_options: StorageOptions = None, ): super().__init__() self.col_sizes: List[int] = [] @@ -1068,13 +1069,16 @@ def __init__( self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf) + path_or_buf, encoding, _, should_close = get_filepath_or_buffer( + path_or_buf, storage_options=storage_options + ) if isinstance(path_or_buf, (str, bytes)): self.path_or_buf = open(path_or_buf, "rb") - elif isinstance(path_or_buf, IOBase): + elif hasattr(path_or_buf, "read"): # Copy to BytesIO, and ensure no encoding - contents = path_or_buf.read() + pb: Any = path_or_buf + contents = pb.read() self.path_or_buf = BytesIO(contents) self._read_header() @@ -1906,6 +1910,7 @@ def read_stata( order_categoricals: bool = True, chunksize: Optional[int] = None, iterator: bool = False, + storage_options: StorageOptions = None, ) -> Union[DataFrame, StataReader]: reader = StataReader( @@ -1918,6 +1923,7 @@ def read_stata( columns=columns, order_categoricals=order_categoricals, chunksize=chunksize, + storage_options=storage_options, ) if iterator or chunksize: @@ -1931,7 +1937,9 @@ def read_stata( def _open_file_binary_write( - fname: FilePathOrBuffer, compression: Union[str, Mapping[str, str], None], + fname: FilePathOrBuffer, + compression: Union[str, Mapping[str, str], None], + storage_options: StorageOptions = None, ) -> Tuple[BinaryIO, bool, Optional[Union[str, Mapping[str, str]]]]: """ Open a binary file or no-op if file-like. @@ -1943,6 +1951,16 @@ def _open_file_binary_write( compression : {str, dict, None} The compression method to use. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. 
+ host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + Returns ------- file : file-like object @@ -1961,7 +1979,10 @@ def _open_file_binary_write( compression_typ, compression_args = get_compression_method(compression) compression_typ = infer_compression(fname, compression_typ) path_or_buf, _, compression_typ, _ = get_filepath_or_buffer( - fname, compression=compression_typ + fname, + mode="wb", + compression=compression_typ, + storage_options=storage_options, ) if compression_typ is not None: compression = compression_args @@ -2158,6 +2179,16 @@ class StataWriter(StataParser): .. versionadded:: 1.1.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + Returns ------- writer : StataWriter instance @@ -2207,6 +2238,7 @@ def __init__( data_label: Optional[str] = None, variable_labels: Optional[Dict[Label, str]] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: StorageOptions = None, ): super().__init__() self._convert_dates = {} if convert_dates is None else convert_dates @@ -2219,6 +2251,7 @@ def __init__( self._output_file: Optional[BinaryIO] = None # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) + self.storage_options = storage_options if byteorder is None: byteorder = sys.byteorder @@ -2505,7 +2538,7 @@ def _encode_strings(self) -> None: def write_file(self) -> None: self._file, self._own_file, compression = _open_file_binary_write( - self._fname, self._compression + self._fname, self._compression, storage_options=self.storage_options ) if compression is not None: self._output_file = self._file @@ -3088,6 +3121,7 @@ def __init__( variable_labels: Optional[Dict[Label, str]] = None, convert_strl: Optional[Sequence[Label]] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: StorageOptions = None, ): # Copy to new list since convert_strl might be modified later self._convert_strl: List[Label] = [] @@ -3104,6 +3138,7 @@ def __init__( data_label=data_label, variable_labels=variable_labels, compression=compression, + storage_options=storage_options, ) self._map: Dict[str, int] = {} self._strl_blob = b"" @@ -3491,6 +3526,7 @@ def __init__( convert_strl: Optional[Sequence[Label]] = None, version: Optional[int] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: StorageOptions = None, ): if version is None: version = 118 if data.shape[1] <= 32767 else 119 @@ -3513,6 +3549,7 @@ def __init__( variable_labels=variable_labels, convert_strl=convert_strl, compression=compression, + storage_options=storage_options, ) # Override version set in StataWriter117 init self._dta_version = version diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index a0723452ccb70..3e89f6ca4ae16 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -1,7 
+1,18 @@
+import io
+
 import numpy as np
 import pytest
 
-from pandas import DataFrame, date_range, read_csv, read_parquet
+from pandas import (
+    DataFrame,
+    date_range,
+    read_csv,
+    read_feather,
+    read_json,
+    read_parquet,
+    read_pickle,
+    read_stata,
+)
 import pandas._testing as tm
 from pandas.util import _test_decorators as td
 
@@ -63,6 +74,16 @@ def test_to_csv(cleared_fs):
     tm.assert_frame_equal(df1, df2)
 
 
+def test_csv_options(fsspectest):
+    df = DataFrame({"a": [0]})
+    df.to_csv(
+        "testmem://test/test.csv", storage_options={"test": "csv_write"}, index=False
+    )
+    assert fsspectest.test[0] == "csv_write"
+    read_csv("testmem://test/test.csv", storage_options={"test": "csv_read"})
+    assert fsspectest.test[0] == "csv_read"
+
+
 @td.skip_if_no("fastparquet")
 def test_to_parquet_new_file(monkeypatch, cleared_fs):
     """Regression test for writing to a not-yet-existent GCS Parquet file."""
@@ -71,6 +92,44 @@ def test_to_parquet_new_file(monkeypatch, cleared_fs):
     )
 
 
+@td.skip_if_no("pyarrow")
+def test_arrowparquet_options(fsspectest):
+    """Test that storage_options are passed through to fsspec (pyarrow engine)."""
+    df = DataFrame({"a": [0]})
+    df.to_parquet(
+        "testmem://test/test.csv",
+        engine="pyarrow",
+        compression=None,
+        storage_options={"test": "parquet_write"},
+    )
+    assert fsspectest.test[0] == "parquet_write"
+    read_parquet(
+        "testmem://test/test.csv",
+        engine="pyarrow",
+        storage_options={"test": "parquet_read"},
+    )
+    assert fsspectest.test[0] == "parquet_read"
+
+
+@td.skip_if_no("fastparquet")
+def test_fastparquet_options(fsspectest):
+    """Test that storage_options are passed through to fsspec (fastparquet engine)."""
+    df = DataFrame({"a": [0]})
+    df.to_parquet(
+        "testmem://test/test.csv",
+        engine="fastparquet",
+        compression=None,
+        storage_options={"test": "parquet_write"},
+    )
+    assert fsspectest.test[0] == "parquet_write"
+    read_parquet(
+        "testmem://test/test.csv",
+        engine="fastparquet",
+        storage_options={"test": "parquet_read"},
+    )
+    assert fsspectest.test[0] == "parquet_read"
+
+
 @td.skip_if_no("s3fs")
 def test_from_s3_csv(s3_resource, tips_file):
     tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file))
@@ -101,3 +160,67 @@ def test_not_present_exception():
     with pytest.raises(ImportError) as e:
         read_csv("memory://test/test.csv")
     assert "fsspec library is required" in str(e.value)
+
+
+@td.skip_if_no("pyarrow")
+def test_feather_options(fsspectest):
+    df = DataFrame({"a": [0]})
+    df.to_feather("testmem://afile", storage_options={"test": "feather_write"})
+    assert fsspectest.test[0] == "feather_write"
+    out = read_feather("testmem://afile", storage_options={"test": "feather_read"})
+    assert fsspectest.test[0] == "feather_read"
+    tm.assert_frame_equal(df, out)
+
+
+def test_pickle_options(fsspectest):
+    df = DataFrame({"a": [0]})
+    df.to_pickle("testmem://afile", storage_options={"test": "pickle_write"})
+    assert fsspectest.test[0] == "pickle_write"
+    out = read_pickle("testmem://afile", storage_options={"test": "pickle_read"})
+    assert fsspectest.test[0] == "pickle_read"
+    tm.assert_frame_equal(df, out)
+
+
+def test_json_options(fsspectest):
+    df = DataFrame({"a": [0]})
+    df.to_json("testmem://afile", storage_options={"test": "json_write"})
+    assert fsspectest.test[0] == "json_write"
+    out = read_json("testmem://afile", storage_options={"test": "json_read"})
+    assert fsspectest.test[0] == "json_read"
+    tm.assert_frame_equal(df, out)
+
+
+def test_stata_options(fsspectest):
+    df = DataFrame({"a": [0]})
+    df.to_stata(
+        "testmem://afile", 
storage_options={"test": "stata_write"}, write_index=False + ) + assert fsspectest.test[0] == "stata_write" + out = read_stata("testmem://afile", storage_options={"test": "stata_read"}) + assert fsspectest.test[0] == "stata_read" + tm.assert_frame_equal(df, out.astype("int64")) + + +@td.skip_if_no("tabulate") +def test_markdown_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_markdown("testmem://afile", storage_options={"test": "md_write"}) + assert fsspectest.test[0] == "md_write" + assert fsspectest.cat("afile") + + +@td.skip_if_no("pyarrow") +def test_non_fsspec_options(): + with pytest.raises(ValueError, match="storage_options"): + read_csv("localfile", storage_options={"a": True}) + with pytest.raises(ValueError, match="storage_options"): + # separate test for parquet, which has a different code path + read_parquet("localfile", storage_options={"a": True}) + by = io.BytesIO() + + with pytest.raises(ValueError, match="storage_options"): + read_csv(by, storage_options={"a": True}) + + df = DataFrame({"a": [0]}) + with pytest.raises(ValueError, match="storage_options"): + df.to_parquet("nonfsspecpath", storage_options={"a": True}) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 5e0f7edf4d8ae..a137e76b1696b 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -32,7 +32,7 @@ def test_read_without_creds_from_pub_bucket(): @tm.network @td.skip_if_no("s3fs") -def test_read_with_creds_from_pub_bucke(): +def test_read_with_creds_from_pub_bucket(): # Ensure we can read from a public bucket with credentials # GH 34626 # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt From 32abe63995c721bc6b7494c815bf7c6b714bb918 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 10 Aug 2020 23:52:08 +0100 Subject: [PATCH 23/51] REF/PERF: Move MultiIndex._tuples to MultiIndex._cache (#35641) --- pandas/core/indexes/multi.py | 20 ++++++---------- pandas/io/pytables.py | 8 +++---- pandas/tests/indexes/multi/test_compat.py | 29 +++++++++++++++++------ 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 13927dede5542..448c2dfe4a29d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -243,7 +243,6 @@ class MultiIndex(Index): _comparables = ["names"] rename = Index.set_names - _tuples = None sortorder: Optional[int] # -------------------------------------------------------------------- @@ -634,16 +633,9 @@ def from_frame(cls, df, sortorder=None, names=None): # -------------------------------------------------------------------- - @property + @cache_readonly def _values(self): # We override here, since our parent uses _data, which we don't use. 
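
Here ``cache_readonly`` is what makes the hand-rolled ``_tuples`` slot
redundant: the first access computes the value and stashes it in
``self._cache`` under the property's name, and ``_reset_cache()`` simply
clears that dict. A simplified pure-Python equivalent of the descriptor (the
real one lives in ``pandas._libs.properties``):

    class cache_readonly:
        def __init__(self, func):
            self.func = func
            self.name = func.__name__

        def __get__(self, obj, typ=None):
            if obj is None:
                return self.func  # accessed on the class, not an instance
            cache = obj._cache  # MultiIndex keeps this dict on each instance
            if self.name not in cache:
                cache[self.name] = self.func(obj)  # computed at most once
            return cache[self.name]

With that in place, ``_values`` keeps the old tuple-building logic but drops
all of the manual invalidation:
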
- return self.values - - @property - def values(self): - if self._tuples is not None: - return self._tuples - values = [] for i in range(self.nlevels): @@ -657,8 +649,12 @@ def values(self): vals = np.array(vals, copy=False) values.append(vals) - self._tuples = lib.fast_zip(values) - return self._tuples + arr = lib.fast_zip(values) + return arr + + @property + def values(self): + return self._values @property def array(self): @@ -737,7 +733,6 @@ def _set_levels( if any(names): self._set_names(names) - self._tuples = None self._reset_cache() def set_levels(self, levels, level=None, inplace=None, verify_integrity=True): @@ -906,7 +901,6 @@ def _set_codes( self._codes = new_codes - self._tuples = None self._reset_cache() def set_codes(self, codes, level=None, inplace=None, verify_integrity=True): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index aeb7b3e044794..2abc570a04de3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -320,6 +320,10 @@ def read_hdf( mode : {'r', 'r+', 'a'}, default 'r' Mode to use when opening the file. Ignored if path_or_buf is a :class:`pandas.HDFStore`. Default is 'r'. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. where : list, optional A list of Term (or convertible) objects. start : int, optional @@ -332,10 +336,6 @@ def read_hdf( Return an iterator object. chunksize : int, optional Number of rows to include in an iteration when using an iterator. - errors : str, default 'strict' - Specifies how encoding and decoding errors are to be handled. - See the errors argument for :func:`open` for a full list - of options. **kwargs Additional keyword arguments passed to HDFStore. diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index b2500efef9e03..72b5ed0edaa78 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -68,24 +68,33 @@ def test_inplace_mutation_resets_values(): mi1 = MultiIndex(levels=levels, codes=codes) mi2 = MultiIndex(levels=levels2, codes=codes) + + # instantiating MultiIndex should not access/cache _.values + assert "_values" not in mi1._cache + assert "_values" not in mi2._cache + vals = mi1.values.copy() vals2 = mi2.values.copy() - assert mi1._tuples is not None + # accessing .values should cache ._values + assert mi1._values is mi1._cache["_values"] + assert mi1.values is mi1._cache["_values"] + assert isinstance(mi1._cache["_values"], np.ndarray) # Make sure level setting works new_vals = mi1.set_levels(levels2).values tm.assert_almost_equal(vals2, new_vals) - # Non-inplace doesn't kill _tuples [implementation detail] - tm.assert_almost_equal(mi1._tuples, vals) + # Non-inplace doesn't drop _values from _cache [implementation detail] + tm.assert_almost_equal(mi1._cache["_values"], vals) # ...and values is still same too tm.assert_almost_equal(mi1.values, vals) - # Inplace should kill _tuples + # Inplace should drop _values from _cache with tm.assert_produces_warning(FutureWarning): mi1.set_levels(levels2, inplace=True) + assert "_values" not in mi1._cache tm.assert_almost_equal(mi1.values, vals2) # Make sure label setting works too @@ -95,18 +104,24 @@ def test_inplace_mutation_resets_values(): # Must be 1d array of tuples assert exp_values.shape == (6,) - new_values = mi2.set_codes(codes2).values + + new_mi = mi2.set_codes(codes2) + assert "_values" not in new_mi._cache + new_values = new_mi.values + 
assert "_values" in new_mi._cache
 
     # Not inplace shouldn't change
-    tm.assert_almost_equal(mi2._tuples, vals2)
+    tm.assert_almost_equal(mi2._cache["_values"], vals2)
 
     # Should have correct values
     tm.assert_almost_equal(exp_values, new_values)
 
-    # ...and again setting inplace should kill _tuples, etc
+    # ...and again setting inplace should drop _values from _cache, etc
     with tm.assert_produces_warning(FutureWarning):
         mi2.set_codes(codes2, inplace=True)
+    assert "_values" not in mi2._cache
     tm.assert_almost_equal(mi2.values, new_values)
+    assert "_values" in mi2._cache
 
 
 def test_ndarray_compat_properties(idx, compat_props):

From cac9f28364a3fe050dd01b141abbbf4396312267 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Mon, 10 Aug 2020 18:02:11 -0500
Subject: [PATCH 24/51] Doc notes for core team members (#35608)

---
 doc/source/development/maintaining.rst | 42 ++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
index 9f9e9dc2631f3..cd084ab263477 100644
--- a/doc/source/development/maintaining.rst
+++ b/doc/source/development/maintaining.rst
@@ -132,17 +132,24 @@ respond or self-close their issue if it's determined that the behavior is not a
 or the feature is out of scope. Sometimes reporters just go away though, and we'll
 close the issue after the conversation has died.
 
+.. _maintaining.reviewing:
+
 Reviewing pull requests
 -----------------------
 
 Anybody can review a pull request: regular contributors, triagers, or core-team
-members. Here are some guidelines to check.
+members. But only core-team members can merge pull requests when they're ready.
+
+Here are some things to check when reviewing a pull request.
 
-* Tests should be in a sensible location.
+* Tests should be in a sensible location: in the same file as closely related tests.
 * New public APIs should be included somewhere in ``doc/source/reference/``.
 * New / changed API should use the ``versionadded`` or ``versionchanged`` directives in the docstring.
 * User-facing changes should have a whatsnew in the appropriate file.
 * Regression tests should reference the original GitHub issue number like ``# GH-1234``.
+* The pull request should be labeled and assigned the appropriate milestone (the next patch release
+  for regression fixes and small bug fixes, the next minor milestone otherwise)
+* Changes should comply with our :ref:`policies.version`.
 
 Cleaning up old issues
 ----------------------
@@ -189,5 +196,34 @@ being helpful on the issue tracker.
 
 The current list of core-team members is at
 https://github.com/pandas-dev/pandas-governance/blob/master/people.md
 
+
+.. _maintaining.merging:
+
+Merging pull requests
+---------------------
+
+Only core team members can merge pull requests. We have a few guidelines.
+
+1. You should typically not self-merge your own pull requests. Exceptions include
+   things like small changes to fix CI (e.g. pinning a package version).
+2. You should not merge pull requests that have an active discussion, or pull
+   requests that have any ``-1`` votes from a core maintainer. Pandas operates
+   by consensus.
+3. For larger changes, it's good to have a +1 from at least two core team members.
+
+In addition to the items listed in :ref:`maintaining.closing`, you should verify
+that the pull request is assigned the correct milestone.
+
+Pull requests merged with a patch-release milestone will typically be backported
+by our bot. 
Verify that the bot noticed the merge (it will leave a comment within +a minute typically). If a manual backport is needed please do that, and remove +the "Needs backport" label once you've done it manually. If you forget to assign +a milestone before tagging, you can request the bot to backport it with: + +.. code-block:: console + + @Meeseeksdev backport + + .. _governance documents: https://github.com/pandas-dev/pandas-governance -.. _list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization \ No newline at end of file +.. _list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization From 993ab0825cc4d47f99490a5cf6255bf6730c580f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 10 Aug 2020 17:01:57 -0700 Subject: [PATCH 25/51] BUG: DataFrame.apply with func altering row in-place (#35633) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/apply.py | 2 ++ pandas/tests/frame/apply/test_frame_apply.py | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index e5860644fa371..415f9e508feb8 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) +- Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 6b8d7dc35fe95..6d44cf917a07a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -389,6 +389,8 @@ def series_generator(self): blk = mgr.blocks[0] for (arr, name) in zip(values, self.index): + # GH#35462 re-pin mgr in case setitem changed it + ser._mgr = mgr blk.values = arr ser.name = name yield ser diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 3a32278e2a4b1..538978358c8e7 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1522,3 +1522,22 @@ def test_apply_dtype(self, col): expected = df.dtypes tm.assert_series_equal(result, expected) + + +def test_apply_mutating(): + # GH#35462 case where applied func pins a new BlockManager to a row + df = pd.DataFrame({"a": range(100), "b": range(100, 200)}) + + def func(row): + mgr = row._mgr + row.loc["a"] += 1 + assert row._mgr is not mgr + return row + + expected = df.copy() + expected["a"] += 1 + + result = df.apply(func, axis=1) + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, result) From 0639e7f71507fae96bca674003e5904444104ccc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 10 Aug 2020 17:35:38 -0700 Subject: [PATCH 26/51] REF: use consistent pattern in tslibs.vectorized (#35613) --- pandas/_libs/tslibs/vectorized.pyx | 149 +++++++++++++---------------- 1 file changed, 66 insertions(+), 83 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index b23f8255a76ac..c3c78ca54885a 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -275,44 +275,38 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t int64_t[:] deltas str typ Py_ssize_t[:] pos - int64_t delta, local_val - - if tz is None or is_utc(tz): - with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] - result[i] = normalize_i8_stamp(local_val) + int64_t local_val, delta = NPY_NAT + bint use_utc = False, use_tzlocal = False, use_fixed = False + + if is_utc(tz) or tz is None: + use_utc = True elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - result[i] = normalize_i8_stamp(local_val) + use_tzlocal = True else: - # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 + use_fixed = True delta = deltas[0] - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] + delta - result[i] = normalize_i8_stamp(local_val) else: pos = trans.searchsorted(stamps, side="right") - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] + deltas[pos[i]] - result[i] = normalize_i8_stamp(local_val) + + for i in range(n): + # TODO: reinstate nogil for use_utc case? 
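+        # The use_* flags are resolved once, before the loop, so each
+        # iteration is a cheap four-way branch: UTC stamps pass through,
+        # tzlocal converts per stamp, fixed offsets add a single delta,
+        # and DST zones use the offset at this stamp's precomputed
+        # searchsorted position.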
+ if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + + if use_utc: + local_val = stamps[i] + elif use_tzlocal: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + elif use_fixed: + local_val = stamps[i] + delta + else: + local_val = stamps[i] + deltas[pos[i]] + + result[i] = normalize_i8_stamp(local_val) return result.base # `.base` to access underlying ndarray @@ -339,40 +333,36 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None): ndarray[int64_t] trans int64_t[:] deltas intp_t[:] pos - int64_t local_val, delta + int64_t local_val, delta = NPY_NAT str typ int64_t day_nanos = 24 * 3600 * 1_000_000_000 + bint use_utc = False, use_tzlocal = False, use_fixed = False - if tz is None or is_utc(tz): - for i in range(n): - local_val = stamps[i] - if local_val % day_nanos != 0: - return False - + if is_utc(tz) or tz is None: + use_utc = True elif is_tzlocal(tz): - for i in range(n): - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - if local_val % day_nanos != 0: - return False + use_tzlocal = True else: trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 + use_fixed = True delta = deltas[0] - for i in range(n): - # Adjust datetime64 timestamp, recompute datetimestruct - local_val = stamps[i] + delta - if local_val % day_nanos != 0: - return False + else: + pos = trans.searchsorted(stamps, side="right") - 1 + for i in range(n): + if use_utc: + local_val = stamps[i] + elif use_tzlocal: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + elif use_fixed: + local_val = stamps[i] + delta else: - pos = trans.searchsorted(stamps) - 1 - for i in range(n): - # Adjust datetime64 timestamp, recompute datetimestruct - local_val = stamps[i] + deltas[pos[i]] - if local_val % day_nanos != 0: - return False + local_val = stamps[i] + deltas[pos[i]] + + if local_val % day_nanos != 0: + return False return True @@ -390,45 +380,38 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): int64_t[:] deltas Py_ssize_t[:] pos npy_datetimestruct dts - int64_t local_val + int64_t local_val, delta = NPY_NAT + bint use_utc = False, use_tzlocal = False, use_fixed = False if is_utc(tz) or tz is None: - with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i], &dts) - result[i] = get_period_ordinal(&dts, freq) - + use_utc = True elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - dt64_to_dtstruct(local_val, &dts) - result[i] = get_period_ordinal(&dts, freq) + use_tzlocal = True else: - # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i] + deltas[0], &dts) - result[i] = get_period_ordinal(&dts, freq) + use_fixed = True + delta = deltas[0] else: pos = trans.searchsorted(stamps, side="right") - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - result[i] = get_period_ordinal(&dts, freq) + for i in range(n): + # TODO: reinstate nogil for use_utc case? 
+        if stamps[i] == NPY_NAT:
+            result[i] = NPY_NAT
+            continue
+
+        if use_utc:
+            local_val = stamps[i]
+        elif use_tzlocal:
+            local_val = tz_convert_utc_to_tzlocal(stamps[i], tz)
+        elif use_fixed:
+            local_val = stamps[i] + delta
+        else:
+            local_val = stamps[i] + deltas[pos[i]]
+
+        dt64_to_dtstruct(local_val, &dts)
+        result[i] = get_period_ordinal(&dts, freq)
 
     return result.base  # .base to get underlying ndarray

From c87e40c23575d28908bbfb806682133e45a15bca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ale=C5=A1=20Erjavec?=
Date: Tue, 11 Aug 2020 02:36:26 +0200
Subject: [PATCH 27/51] [FIX] Handle decimal and thousand separator in
 'round_trip' converter (#35377)

---
 doc/source/whatsnew/v1.2.0.rst               |  1 +
 pandas/_libs/src/parser/tokenizer.c          | 57 +++++++++++-
 pandas/tests/io/parser/test_c_parser_only.py | 98 ++++++++++++++++++++
 3 files changed, 154 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 94bb265c32e4c..ebc8ebf0dc2f7 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -155,6 +155,7 @@ I/O
 ^^^
 
 - Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
+- In :meth:`read_csv`, ``float_precision='round_trip'`` now handles the ``decimal`` and ``thousands`` parameters (:issue:`35365`)
 -
 
 Plotting

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index a195c0daf5271..df8ec68986ccb 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1778,20 +1778,73 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
   return number;
 }
 
+/* Copy a decimal number string with `decimal` as the decimal point and
+   `tsep` as the thousands separator to an equivalent c-locale decimal
+   string (stripping `tsep`, replacing `decimal` with '.'). The returned
+   memory should be freed with a call to `free`.
+*/
+
+char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
+                              char tsep) {
+  const char *p = s;
+  size_t length = strlen(s);
+  char *s_copy = malloc(length + 1);
+  char *dst = s_copy;
+  // Copy leading sign
+  if (*p == '+' || *p == '-') {
+    *dst++ = *p++;
+  }
+  // Copy integer part dropping `tsep`
+  while (isdigit_ascii(*p)) {
+    *dst++ = *p++;
+    p += (tsep != '\0' && *p == tsep);
+  }
+  // Replace `decimal` with '.'
+  if (*p == decimal) {
+    *dst++ = '.';
+    p++;
+  }
+  // Copy the remainder of the string as is.
+  strncpy(dst, p, length + 1 - (p - s));
+  if (endpos != NULL)
+    *endpos = (char *)(s + length);
+  return s_copy;
+}
+
+
 double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
                   int skip_trailing, int *error, int *maybe_int) {
+  // 'normalize' representation to C-locale; replace decimal with '.' and
+  // remove t(housand)sep.
+  char *endptr;
+  char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
   // This is called from a nogil block in parsers.pyx
   // so need to explicitly get GIL before Python calls
   PyGILState_STATE gstate;
   gstate = PyGILState_Ensure();
-
-  double r = PyOS_string_to_double(p, q, 0);
+  char *endpc;
+  double r = PyOS_string_to_double(pc, &endpc, 0);
+  // PyOS_string_to_double needs to consume the whole string
+  if (endpc == pc + strlen(pc)) {
+    if (q != NULL) {
+      // report endptr from source string (p)
+      *q = (char *) endptr;
+    }
+  } else {
+    *error = -1;
+    if (q != NULL) {
+      // p and pc are different len due to tsep removal. Can't report
+      // how much it has consumed of p. Just rewind to beginning. 
+ *q = (char *)p; + } + } if (maybe_int != NULL) *maybe_int = 0; if (PyErr_Occurred() != NULL) *error = -1; else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; PyErr_Clear(); PyGILState_Release(gstate); + free(pc); return r; } diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index d76d01904731a..50179fc1ec4b8 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -606,3 +606,101 @@ def test_unix_style_breaks(c_parser_only): result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c") expected = DataFrame(columns=["col_1", "col_2", "col_3"]) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"]) +@pytest.mark.parametrize( + "data,thousands,decimal", + [ + ( + """A|B|C +1|2,334.01|5 +10|13|10. +""", + ",", + ".", + ), + ( + """A|B|C +1|2.334,01|5 +10|13|10, +""", + ".", + ",", + ), + ], +) +def test_1000_sep_with_decimal( + c_parser_only, data, thousands, decimal, float_precision +): + parser = c_parser_only + expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) + + result = parser.read_csv( + StringIO(data), + sep="|", + thousands=thousands, + decimal=decimal, + float_precision=float_precision, + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "float_precision", [None, "high", "round_trip"], +) +@pytest.mark.parametrize( + "value,expected", + [ + ("-1,0", -1.0), + ("-1,2e0", -1.2), + ("-1e0", -1.0), + ("+1e0", 1.0), + ("+1e+0", 1.0), + ("+1e-1", 0.1), + ("+,1e1", 1.0), + ("+1,e0", 1.0), + ("-,1e1", -1.0), + ("-1,e0", -1.0), + ("0,1", 0.1), + ("1,", 1.0), + (",1", 0.1), + ("-,1", -0.1), + ("1_,", 1.0), + ("1_234,56", 1234.56), + ("1_234,56e0", 1234.56), + # negative cases; must not parse as float + ("_", "_"), + ("-_", "-_"), + ("-_1", "-_1"), + ("-_1e0", "-_1e0"), + ("_1", "_1"), + ("_1,", "_1,"), + ("_1,_", "_1,_"), + ("_1e0", "_1e0"), + ("1,2e_1", "1,2e_1"), + ("1,2e1_0", "1,2e1_0"), + ("1,_2", "1,_2"), + (",1__2", ",1__2"), + (",1e", ",1e"), + ("-,1e", "-,1e"), + ("1_000,000_000", "1_000,000_000"), + ("1,e1_2", "1,e1_2"), + ], +) +def test_1000_sep_decimal_float_precision( + c_parser_only, value, expected, float_precision +): + # test decimal and thousand sep handling in across 'float_precision' + # parsers + parser = c_parser_only + df = parser.read_csv( + StringIO(value), + sep="|", + thousands="_", + decimal=",", + header=None, + float_precision=float_precision, + ) + val = df.iloc[0, 0] + assert val == expected From 83807088329b2a7e6422e0d0ba460870a265d3d2 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Mon, 10 Aug 2020 20:17:47 -0500 Subject: [PATCH 28/51] Drop Python 3.6 support (#35214) * DEPS: drop 3.6 (#34472) * DEPS: drop 3.6 (#34472) * DEPS: fix file name (#34472) * DEPS: fix import (#34472) * DEPS: fix job name (#34472) * DEPS: resolve min version conflicts (#34472) * DEPS: fix env name (#34472) * DEPS: remove py36 check in test, bump matplotlib (#34472) * DEPS: fix travis 37 locale (#34472) * DEPS: remove PY37 check from tests (#34472) * DEPS: remove import (#34472) * DEPS: remove PY37 in benchmark (#34472) * try to fix timeout * pytable version * update minimum version * remove xfail for test apply * remove import * try to fix timeout * try to fix timeout * try to fix timeout * bump to 3.7.1 to fix timeout * migrate ci * fix env name * remove py37-locale from azure * resolve conflicts * update ci * update ci * sync with master * whatsnew 
and install doc * whatsnew and install doc * update environment.yml * update environment.yml * uncomment azure p37 locale * move min pyarrow test * bumpy numpy to 1.16.5 * bumpy numpy to 1.16.5 * fix 32bit * comment out 32bit CI * update numpy version in numpy/__init__.py * remove import from numpy/__init__.py * filter DeprecationWarning * filter DeprecationWarning * skip unreliable test for windows * skip unreliable test for windows * fix parameter order in docstring * skip test * skip test --- .travis.yml | 4 +- asv_bench/benchmarks/package.py | 24 +- ci/azure/posix.yml | 52 ++--- ci/azure/windows.yml | 12 +- ...zure-36-32bit.yaml => azure-37-32bit.yaml} | 8 +- ci/deps/azure-37-locale.yaml | 8 +- ...le_slow.yaml => azure-37-locale_slow.yaml} | 18 +- ...ns.yaml => azure-37-minimum_versions.yaml} | 17 +- ...{azure-36-slow.yaml => azure-37-slow.yaml} | 2 +- ...re-36-locale.yaml => azure-38-locale.yaml} | 16 +- ...7-numpydev.yaml => azure-38-numpydev.yaml} | 2 +- ...zure-macos-36.yaml => azure-macos-37.yaml} | 6 +- ci/deps/azure-windows-37.yaml | 4 +- ...-windows-36.yaml => azure-windows-38.yaml} | 8 +- ...{travis-36-cov.yaml => travis-37-cov.yaml} | 6 +- ...s-36-locale.yaml => travis-37-locale.yaml} | 10 +- doc/source/getting_started/install.rst | 36 +-- doc/source/whatsnew/v1.2.0.rst | 78 +++++++ environment.yml | 6 +- pandas/__init__.py | 208 +++++------------- pandas/compat/__init__.py | 1 - pandas/compat/numpy/__init__.py | 9 +- pandas/core/frame.py | 5 +- pandas/tests/api/test_api.py | 25 +-- .../arrays/categorical/test_constructors.py | 3 - pandas/tests/extension/arrow/test_bool.py | 4 - pandas/tests/extension/test_numpy.py | 6 - pandas/tests/frame/test_api.py | 13 +- pandas/tests/frame/test_constructors.py | 5 +- pandas/tests/groupby/test_categorical.py | 9 - .../tests/io/json/test_json_table_schema.py | 6 + pandas/tests/io/json/test_pandas.py | 3 + pandas/tests/io/parser/test_common.py | 2 + pandas/tests/scalar/test_nat.py | 4 - pandas/tests/tseries/offsets/test_offsets.py | 11 +- pandas/util/__init__.py | 30 +-- pyproject.toml | 8 +- requirements-dev.txt | 6 +- setup.py | 9 +- 39 files changed, 283 insertions(+), 401 deletions(-) rename ci/deps/{azure-36-32bit.yaml => azure-37-32bit.yaml} (76%) rename ci/deps/{azure-36-locale_slow.yaml => azure-37-locale_slow.yaml} (67%) rename ci/deps/{azure-36-minimum_versions.yaml => azure-37-minimum_versions.yaml} (70%) rename ci/deps/{azure-36-slow.yaml => azure-37-slow.yaml} (96%) rename ci/deps/{azure-36-locale.yaml => azure-38-locale.yaml} (69%) rename ci/deps/{azure-37-numpydev.yaml => azure-38-numpydev.yaml} (96%) rename ci/deps/{azure-macos-36.yaml => azure-macos-37.yaml} (89%) rename ci/deps/{azure-windows-36.yaml => azure-windows-38.yaml} (84%) rename ci/deps/{travis-36-cov.yaml => travis-37-cov.yaml} (93%) rename ci/deps/{travis-36-locale.yaml => travis-37-locale.yaml} (85%) diff --git a/.travis.yml b/.travis.yml index b016cf386098e..2e98cf47aea3e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -45,7 +45,7 @@ matrix: - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)" - env: - - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" + - JOB="3.7, locale" ENV_FILE="ci/deps/travis-37-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" services: - mysql - postgresql @@ 
-54,7 +54,7 @@ matrix: # Enabling Deprecations when running tests # PANDAS_TESTING_MODE="deprecate" causes DeprecationWarning messages to be displayed in the logs # See pandas/_testing.py for more details. - - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" + - JOB="3.7, coverage" ENV_FILE="ci/deps/travis-37-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" services: - mysql - postgresql diff --git a/asv_bench/benchmarks/package.py b/asv_bench/benchmarks/package.py index 8ca33db361fa0..34fe4929a752b 100644 --- a/asv_bench/benchmarks/package.py +++ b/asv_bench/benchmarks/package.py @@ -4,22 +4,16 @@ import subprocess import sys -from pandas.compat import PY37 - class TimeImport: def time_import(self): - if PY37: - # on py37+ we the "-X importtime" usage gives us a more precise - # measurement of the import time we actually care about, - # without the subprocess or interpreter overhead - cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"] - p = subprocess.run(cmd, stderr=subprocess.PIPE) - - line = p.stderr.splitlines()[-1] - field = line.split(b"|")[-2].strip() - total = int(field) # microseconds - return total + # on py37+ we the "-X importtime" usage gives us a more precise + # measurement of the import time we actually care about, + # without the subprocess or interpreter overhead + cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"] + p = subprocess.run(cmd, stderr=subprocess.PIPE) - cmd = [sys.executable, "-c", "import pandas as pd"] - subprocess.run(cmd, stderr=subprocess.PIPE) + line = p.stderr.splitlines()[-1] + field = line.split(b"|")[-2].strip() + total = int(field) # microseconds + return total diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index f716974f6add1..9f8174b4fa678 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -9,20 +9,20 @@ jobs: strategy: matrix: ${{ if eq(parameters.name, 'macOS') }}: - py36_macos: - ENV_FILE: ci/deps/azure-macos-36.yaml - CONDA_PY: "36" + py37_macos: + ENV_FILE: ci/deps/azure-macos-37.yaml + CONDA_PY: "37" PATTERN: "not slow and not network" ${{ if eq(parameters.name, 'Linux') }}: - py36_minimum_versions: - ENV_FILE: ci/deps/azure-36-minimum_versions.yaml - CONDA_PY: "36" + py37_minimum_versions: + ENV_FILE: ci/deps/azure-37-minimum_versions.yaml + CONDA_PY: "37" PATTERN: "not slow and not network and not clipboard" - py36_locale_slow_old_np: - ENV_FILE: ci/deps/azure-36-locale_slow.yaml - CONDA_PY: "36" + py37_locale_slow: + ENV_FILE: ci/deps/azure-37-locale_slow.yaml + CONDA_PY: "37" PATTERN: "slow" # pandas does not use the language (zh_CN), but should support different encodings (utf8) # we should test with encodings different than utf8, but doesn't seem like Ubuntu supports any @@ -30,36 +30,36 @@ jobs: LC_ALL: "zh_CN.utf8" EXTRA_APT: "language-pack-zh-hans" - py36_slow: - ENV_FILE: ci/deps/azure-36-slow.yaml - CONDA_PY: "36" + py37_slow: + ENV_FILE: ci/deps/azure-37-slow.yaml + CONDA_PY: "37" PATTERN: "slow" - py36_locale: - ENV_FILE: ci/deps/azure-36-locale.yaml - CONDA_PY: "36" + py37_locale: + ENV_FILE: ci/deps/azure-37-locale.yaml + CONDA_PY: "37" PATTERN: "not slow and not network" LANG: "it_IT.utf8" LC_ALL: "it_IT.utf8" EXTRA_APT: "language-pack-it xsel" - #py36_32bit: - # ENV_FILE: ci/deps/azure-36-32bit.yaml - # CONDA_PY: "36" - # PATTERN: "not slow 
and not network and not clipboard" - # BITS32: "yes" +# py37_32bit: +# ENV_FILE: ci/deps/azure-37-32bit.yaml +# CONDA_PY: "37" +# PATTERN: "not slow and not network and not clipboard" +# BITS32: "yes" - py37_locale: - ENV_FILE: ci/deps/azure-37-locale.yaml - CONDA_PY: "37" + py38_locale: + ENV_FILE: ci/deps/azure-38-locale.yaml + CONDA_PY: "38" PATTERN: "not slow and not network" LANG: "zh_CN.utf8" LC_ALL: "zh_CN.utf8" EXTRA_APT: "language-pack-zh-hans xsel" - py37_np_dev: - ENV_FILE: ci/deps/azure-37-numpydev.yaml - CONDA_PY: "37" + py38_np_dev: + ENV_FILE: ci/deps/azure-38-numpydev.yaml + CONDA_PY: "38" PATTERN: "not slow and not network" TEST_ARGS: "-W error" PANDAS_TESTING_MODE: "deprecate" diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 87f1bfd2adb79..5938ba1fd69f5 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -8,16 +8,16 @@ jobs: vmImage: ${{ parameters.vmImage }} strategy: matrix: - py36_np15: - ENV_FILE: ci/deps/azure-windows-36.yaml - CONDA_PY: "36" - PATTERN: "not slow and not network" - - py37_np18: + py37_np16: ENV_FILE: ci/deps/azure-windows-37.yaml CONDA_PY: "37" PATTERN: "not slow and not network" + py38_np18: + ENV_FILE: ci/deps/azure-windows-38.yaml + CONDA_PY: "38" + PATTERN: "not slow and not network" + steps: - powershell: | Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-37-32bit.yaml similarity index 76% rename from ci/deps/azure-36-32bit.yaml rename to ci/deps/azure-37-32bit.yaml index 15704cf0d5427..8e0cd73a9536d 100644 --- a/ci/deps/azure-36-32bit.yaml +++ b/ci/deps/azure-37-32bit.yaml @@ -3,10 +3,10 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - ### Cython 0.29.13 and pytest 5.0.1 for 32 bits are not available with conda, installing below with pip instead + ### Cython 0.29.16 and pytest 5.0.1 for 32 bits are not available with conda, installing below with pip instead - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines @@ -15,12 +15,12 @@ dependencies: - attrs=19.1.0 - gcc_linux-32 - gxx_linux-32 - - numpy=1.14.* - python-dateutil - - pytz=2017.2 + - pytz=2017.3 # see comment above - pip - pip: - cython>=0.29.16 + - numpy>=1.16.5 - pytest>=5.0.1 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 6f64c81f299d1..a6552aa096a22 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -1,5 +1,6 @@ name: pandas-dev channels: + - defaults - conda-forge dependencies: - python=3.7.* @@ -22,7 +23,7 @@ dependencies: - moto - nomkl - numexpr - - numpy + - numpy=1.16.* - openpyxl - pytables - python-dateutil @@ -32,7 +33,4 @@ dependencies: - xlrd - xlsxwriter - xlwt - - pyarrow>=0.15 - - pip - - pip: - - pyxlsb + - moto diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-37-locale_slow.yaml similarity index 67% rename from ci/deps/azure-36-locale_slow.yaml rename to ci/deps/azure-37-locale_slow.yaml index c086b3651afc3..3ccb66e09fe7e 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-37-locale_slow.yaml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - cython>=0.29.16 @@ -16,17 +16,15 @@ dependencies: - beautifulsoup4=4.6.0 - bottleneck=1.2.* - lxml - - matplotlib=2.2.2 - - numpy=1.14.* + - matplotlib=3.0.0 + - numpy=1.16.* - openpyxl=2.5.7 - python-dateutil - python-blosc - - pytz=2017.2 + - pytz=2017.3 - scipy - - sqlalchemy=1.1.4 + - sqlalchemy=1.2.8 - xlrd=1.1.0 - - xlsxwriter=0.9.8 
- - xlwt=1.2.0 - - pip - - pip: - - html5lib==1.0b2 + - xlsxwriter=1.0.2 + - xlwt=1.3.0 + - html5lib=1.0.1 diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml similarity index 70% rename from ci/deps/azure-36-minimum_versions.yaml rename to ci/deps/azure-37-minimum_versions.yaml index f5af7bcf36189..94cc5812bcc10 100644 --- a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-37-minimum_versions.yaml @@ -2,7 +2,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.6.1 + - python=3.7.1 # tools - cython=0.29.16 @@ -15,16 +15,17 @@ dependencies: # pandas dependencies - beautifulsoup4=4.6.0 - bottleneck=1.2.1 - - jinja2=2.8 + - jinja2=2.10 - numba=0.46.0 - - numexpr=2.6.2 - - numpy=1.15.4 + - numexpr=2.6.8 + - numpy=1.16.5 - openpyxl=2.5.7 - - pytables=3.4.3 + - pytables=3.4.4 - python-dateutil=2.7.3 - - pytz=2017.2 + - pytz=2017.3 + - pyarrow=0.15 - scipy=1.2 - xlrd=1.1.0 - - xlsxwriter=0.9.8 - - xlwt=1.2.0 + - xlsxwriter=1.0.2 + - xlwt=1.3.0 - html5lib=1.0.1 diff --git a/ci/deps/azure-36-slow.yaml b/ci/deps/azure-37-slow.yaml similarity index 96% rename from ci/deps/azure-36-slow.yaml rename to ci/deps/azure-37-slow.yaml index 87bad59fa4873..e8ffd3d74ca5e 100644 --- a/ci/deps/azure-36-slow.yaml +++ b/ci/deps/azure-37-slow.yaml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - cython>=0.29.16 diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-38-locale.yaml similarity index 69% rename from ci/deps/azure-36-locale.yaml rename to ci/deps/azure-38-locale.yaml index 3034ed3dc43af..c466a5929ea29 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -1,9 +1,8 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.8.* # tools - cython>=0.29.16 @@ -19,14 +18,12 @@ dependencies: - ipython - jinja2 - lxml - - matplotlib=3.0.* + - matplotlib <3.3.0 + - moto - nomkl - numexpr - - numpy=1.15.* + - numpy - openpyxl - # lowest supported version of pyarrow (putting it here instead of in - # azure-36-minimum_versions because it needs numpy >= 1.14) - - pyarrow=0.13 - pytables - python-dateutil - pytz @@ -35,4 +32,7 @@ dependencies: - xlrd - xlsxwriter - xlwt - - moto + - pyarrow>=0.15 + - pip + - pip: + - pyxlsb diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-38-numpydev.yaml similarity index 96% rename from ci/deps/azure-37-numpydev.yaml rename to ci/deps/azure-38-numpydev.yaml index 5cb58756a6ac1..37592086d49e3 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-38-numpydev.yaml @@ -2,7 +2,7 @@ name: pandas-dev channels: - defaults dependencies: - - python=3.7.* + - python=3.8.* # tools - pytest>=5.0.1 diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-37.yaml similarity index 89% rename from ci/deps/azure-macos-36.yaml rename to ci/deps/azure-macos-37.yaml index eeea249a19ca1..a5a69b9a59576 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -2,7 +2,7 @@ name: pandas-dev channels: - defaults dependencies: - - python=3.6.* + - python=3.7.* # tools - pytest>=5.0.1 @@ -19,9 +19,9 @@ dependencies: - matplotlib=2.2.3 - nomkl - numexpr - - numpy=1.15.4 + - numpy=1.16.5 - openpyxl - - pyarrow>=0.13.0 + - pyarrow>=0.15.0 - pytables - python-dateutil==2.7.3 - pytz diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 5bbd0e2795d7e..4d745454afcab 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ 
-23,9 +23,9 @@ dependencies: - matplotlib=2.2.* - moto - numexpr - - numpy=1.18.* + - numpy=1.16.* - openpyxl - - pyarrow=0.14 + - pyarrow=0.15 - pytables - python-dateutil - pytz diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-38.yaml similarity index 84% rename from ci/deps/azure-windows-36.yaml rename to ci/deps/azure-windows-38.yaml index 548660cabaa67..f428a6dadfaa2 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -3,7 +3,7 @@ channels: - conda-forge - defaults dependencies: - - python=3.6.* + - python=3.8.* # tools - cython>=0.29.16 @@ -16,13 +16,13 @@ dependencies: - blosc - bottleneck - fastparquet>=0.3.2 - - matplotlib=3.0.2 + - matplotlib=3.1.3 - numba - numexpr - - numpy=1.15.* + - numpy=1.18.* - openpyxl - jinja2 - - pyarrow>=0.13.0 + - pyarrow>=0.15.0 - pytables - python-dateutil - pytz diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-37-cov.yaml similarity index 93% rename from ci/deps/travis-36-cov.yaml rename to ci/deps/travis-37-cov.yaml index 177e0d3f4c0af..3a0827a16f97a 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-37-cov.yaml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - cython>=0.29.16 @@ -26,12 +26,12 @@ dependencies: - moto - nomkl - numexpr - - numpy=1.15.* + - numpy=1.16.* - odfpy - openpyxl - pandas-gbq - psycopg2 - - pyarrow>=0.13.0 + - pyarrow>=0.15.0 - pymysql - pytables - python-snappy diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-37-locale.yaml similarity index 85% rename from ci/deps/travis-36-locale.yaml rename to ci/deps/travis-37-locale.yaml index 03a1e751b6a86..4427c1d940bf2 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - cython>=0.29.16 @@ -19,7 +19,7 @@ dependencies: - html5lib - ipython - jinja2 - - lxml=3.8.0 + - lxml=4.3.0 - matplotlib=3.0.* - moto - nomkl @@ -27,14 +27,14 @@ dependencies: - numpy - openpyxl - pandas-gbq=0.12.0 - - psycopg2=2.6.2 + - psycopg2=2.7 - pymysql=0.7.11 - pytables - python-dateutil - pytz - scipy - - sqlalchemy=1.1.4 - - xarray=0.10 + - sqlalchemy=1.3.0 + - xarray=0.12.0 - xlrd - xlsxwriter - xlwt diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index b79a9cd872c47..7ab150394bf51 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -18,7 +18,7 @@ Instructions for installing from source, Python version support ---------------------- -Officially Python 3.6.1 and above, 3.7, and 3.8. +Officially Python 3.7.1 and above, and 3.8. Installing pandas ----------------- @@ -220,9 +220,9 @@ Dependencies Package Minimum supported version ================================================================ ========================== `setuptools `__ 24.2.0 -`NumPy `__ 1.15.4 +`NumPy `__ 1.16.5 `python-dateutil `__ 2.7.3 -`pytz `__ 2017.2 +`pytz `__ 2017.3 ================================================================ ========================== .. _install.recommended_dependencies: @@ -232,7 +232,7 @@ Recommended dependencies * `numexpr `__: for accelerating certain numerical operations. ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. - If installed, must be Version 2.6.2 or higher. + If installed, must be Version 2.6.8 or higher. * `bottleneck `__: for accelerating certain types of ``nan`` evaluations. 
  ``bottleneck`` uses specialized cython routines to achieve large speedups. If installed,
@@ -259,36 +259,36 @@ the method requiring that dependency is called.
 Dependency                Minimum Version    Notes
 ========================= ================== =============================================================
 BeautifulSoup4            4.6.0              HTML parser for read_html (see :ref:`note <optional_html>`)
-Jinja2                                       Conditional formatting with DataFrame.style
+Jinja2                    2.10               Conditional formatting with DataFrame.style
 PyQt4                                        Clipboard I/O
 PyQt5                                        Clipboard I/O
-PyTables                  3.4.3              HDF5-based reading / writing
-SQLAlchemy                1.1.4              SQL support for databases other than sqlite
-SciPy                     0.19.0             Miscellaneous statistical functions
-XLsxWriter                0.9.8              Excel writing
-blosc                                        Compression for HDF5
+PyTables                  3.4.4              HDF5-based reading / writing
+SQLAlchemy                1.2.8              SQL support for databases other than sqlite
+SciPy                     1.2.0              Miscellaneous statistical functions
+xlsxwriter                1.0.2              Excel writing
+blosc                     1.14.3             Compression for HDF5
 fsspec                    0.7.4              Handling files aside from local and HTTP
 fastparquet               0.3.2              Parquet reading / writing
 gcsfs                     0.6.0              Google Cloud Storage access
-html5lib                                     HTML parser for read_html (see :ref:`note <optional_html>`)
-lxml                      3.8.0              HTML parser for read_html (see :ref:`note <optional_html>`)
-matplotlib                2.2.2              Visualization
+html5lib                  1.0.1              HTML parser for read_html (see :ref:`note <optional_html>`)
+lxml                      4.3.0              HTML parser for read_html (see :ref:`note <optional_html>`)
+matplotlib                2.2.3              Visualization
 numba                     0.46.0             Alternative execution engine for rolling operations
 openpyxl                  2.5.7              Reading / writing for xlsx files
 pandas-gbq                0.12.0             Google Big Query access
-psycopg2                                     PostgreSQL engine for sqlalchemy
-pyarrow                   0.12.0             Parquet, ORC (requires 0.13.0), and feather reading / writing
+psycopg2                  2.7                PostgreSQL engine for sqlalchemy
+pyarrow                   0.15.0             Parquet, ORC, and feather reading / writing
 pymysql                   0.7.11             MySQL engine for sqlalchemy
 pyreadstat                                   SPSS files (.sav) reading
-pytables                  3.4.3              HDF5 reading / writing
+pytables                  3.4.4              HDF5 reading / writing
 pyxlsb                    1.0.6              Reading for xlsb files
 qtpy                                         Clipboard I/O
 s3fs                      0.4.0              Amazon S3 access
 tabulate                  0.8.3              Printing in Markdown-friendly format (see `tabulate`_)
-xarray                    0.8.2              pandas-like API for N-dimensional data
+xarray                    0.12.0             pandas-like API for N-dimensional data
 xclip                                        Clipboard I/O on linux
 xlrd                      1.1.0              Excel reading
-xlwt                      1.2.0              Excel writing
+xlwt                      1.3.0              Excel writing
 xsel                                         Clipboard I/O on linux
 zlib                                         Compression for HDF5
 ========================= ================== =============================================================
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index ebc8ebf0dc2f7..86f47a5826214 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -54,6 +54,84 @@ Other enhancements
 -
 -
 
+.. _whatsnew_120.api_breaking.python:
+
+Increased minimum version for Python
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Pandas 1.2.0 supports Python 3.7.1 and higher (:issue:`35214`).
+
+.. _whatsnew_120.api_breaking.deps:
+
+Increased minimum versions for dependencies
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Some minimum supported versions of dependencies were updated (:issue:`35214`).
+If installed, we now require: + ++-----------------+-----------------+----------+---------+ +| Package | Minimum Version | Required | Changed | ++=================+=================+==========+=========+ +| numpy | 1.16.5 | X | X | ++-----------------+-----------------+----------+---------+ +| pytz | 2017.3 | X | X | ++-----------------+-----------------+----------+---------+ +| python-dateutil | 2.7.3 | X | | ++-----------------+-----------------+----------+---------+ +| bottleneck | 1.2.1 | | | ++-----------------+-----------------+----------+---------+ +| numexpr | 2.6.8 | | X | ++-----------------+-----------------+----------+---------+ +| pytest (dev) | 5.0.1 | | X | ++-----------------+-----------------+----------+---------+ + +For `optional libraries `_ the general recommendation is to use the latest version. +The following table lists the lowest version per library that is currently being tested throughout the development of pandas. +Optional libraries below the lowest tested version may still work, but are not considered supported. + ++-----------------+-----------------+---------+ +| Package | Minimum Version | Changed | ++=================+=================+=========+ +| beautifulsoup4 | 4.6.0 | | ++-----------------+-----------------+---------+ +| fastparquet | 0.3.2 | | ++-----------------+-----------------+---------+ +| fsspec | 0.7.4 | | ++-----------------+-----------------+---------+ +| gcsfs | 0.6.0 | | ++-----------------+-----------------+---------+ +| lxml | 4.3.0 | X | ++-----------------+-----------------+---------+ +| matplotlib | 2.2.3 | X | ++-----------------+-----------------+---------+ +| numba | 0.46.0 | | ++-----------------+-----------------+---------+ +| openpyxl | 2.5.7 | | ++-----------------+-----------------+---------+ +| pyarrow | 0.15.0 | X | ++-----------------+-----------------+---------+ +| pymysql | 0.7.11 | X | ++-----------------+-----------------+---------+ +| pytables | 3.4.4 | X | ++-----------------+-----------------+---------+ +| s3fs | 0.4.0 | | ++-----------------+-----------------+---------+ +| scipy | 1.2.0 | | ++-----------------+-----------------+---------+ +| sqlalchemy | 1.2.8 | X | ++-----------------+-----------------+---------+ +| xarray | 0.12.0 | X | ++-----------------+-----------------+---------+ +| xlrd | 1.1.0 | | ++-----------------+-----------------+---------+ +| xlsxwriter | 1.0.2 | X | ++-----------------+-----------------+---------+ +| xlwt | 1.3.0 | X | ++-----------------+-----------------+---------+ +| pandas-gbq | 0.12.0 | | ++-----------------+-----------------+---------+ + +See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. .. --------------------------------------------------------------------------- diff --git a/environment.yml b/environment.yml index ed9762e5b8893..1e51470d43d36 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: dependencies: # required # Pin numpy<1.19 until MPL 3.3.0 is released. 
- - numpy>=1.15,<1.19.0 + - numpy>=1.16.5,<1.19.0 - python=3 - python-dateutil>=2.7.3 - pytz @@ -93,11 +93,11 @@ dependencies: - odfpy - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet - - pyarrow>=0.13.1 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather + - pyarrow>=0.15.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - python-snappy # required by pyarrow - pyqt>=5.9.2 # pandas.read_clipboard - - pytables>=3.4.3 # pandas.read_hdf, DataFrame.to_hdf + - pytables>=3.4.4 # pandas.read_hdf, DataFrame.to_hdf - s3fs>=0.4.0 # file IO when using 's3://...' path - fsspec>=0.7.4 # for generic remote file operations - gcsfs>=0.6.0 # file IO when using 'gcs://...' path diff --git a/pandas/__init__.py b/pandas/__init__.py index d6584bf4f1c4f..36576da74c75d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -20,7 +20,6 @@ # numpy compat from pandas.compat.numpy import ( - _np_version_under1p16, _np_version_under1p17, _np_version_under1p18, _is_numpy_dev, @@ -185,181 +184,76 @@ __git_version__ = v.get("full-revisionid") del get_versions, v + # GH 27101 # TODO: remove Panel compat in 1.0 -if pandas.compat.PY37: - - def __getattr__(name): - import warnings - - if name == "Panel": - - warnings.warn( - "The Panel class is removed from pandas. Accessing it " - "from the top-level namespace will also be removed in the next version", - FutureWarning, - stacklevel=2, - ) - - class Panel: - pass - - return Panel - - elif name == "datetime": - warnings.warn( - "The pandas.datetime class is deprecated " - "and will be removed from pandas in a future version. " - "Import from datetime module instead.", - FutureWarning, - stacklevel=2, - ) - - from datetime import datetime as dt - - return dt - - elif name == "np": - - warnings.warn( - "The pandas.np module is deprecated " - "and will be removed from pandas in a future version. " - "Import numpy directly instead", - FutureWarning, - stacklevel=2, - ) - import numpy as np - - return np - - elif name in {"SparseSeries", "SparseDataFrame"}: - warnings.warn( - f"The {name} class is removed from pandas. Accessing it from " - "the top-level namespace will also be removed in the next version", - FutureWarning, - stacklevel=2, - ) - - return type(name, (), {}) - - elif name == "SparseArray": - - warnings.warn( - "The pandas.SparseArray class is deprecated " - "and will be removed from pandas in a future version. " - "Use pandas.arrays.SparseArray instead.", - FutureWarning, - stacklevel=2, - ) - from pandas.core.arrays.sparse import SparseArray as _SparseArray +def __getattr__(name): + import warnings - return _SparseArray + if name == "Panel": - raise AttributeError(f"module 'pandas' has no attribute '{name}'") + warnings.warn( + "The Panel class is removed from pandas. Accessing it " + "from the top-level namespace will also be removed in the next version", + FutureWarning, + stacklevel=2, + ) + class Panel: + pass -else: + return Panel - class Panel: - pass - - class SparseDataFrame: - pass - - class SparseSeries: - pass - - class __numpy: - def __init__(self): - import numpy as np - import warnings - - self.np = np - self.warnings = warnings - - def __getattr__(self, item): - self.warnings.warn( - "The pandas.np module is deprecated " - "and will be removed from pandas in a future version. 
" - "Import numpy directly instead", - FutureWarning, - stacklevel=2, - ) - - try: - return getattr(self.np, item) - except AttributeError as err: - raise AttributeError(f"module numpy has no attribute {item}") from err - - np = __numpy() - - class __Datetime(type): + elif name == "datetime": + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. " + "Import from datetime module instead.", + FutureWarning, + stacklevel=2, + ) from datetime import datetime as dt - datetime = dt - - def __getattr__(cls, item): - cls.emit_warning() - - try: - return getattr(cls.datetime, item) - except AttributeError as err: - raise AttributeError( - f"module datetime has no attribute {item}" - ) from err - - def __instancecheck__(cls, other): - return isinstance(other, cls.datetime) - - class __DatetimeSub(metaclass=__Datetime): - def emit_warning(dummy=0): - import warnings - - warnings.warn( - "The pandas.datetime class is deprecated " - "and will be removed from pandas in a future version. " - "Import from datetime instead.", - FutureWarning, - stacklevel=3, - ) - - def __new__(cls, *args, **kwargs): - cls.emit_warning() - from datetime import datetime as dt - - return dt(*args, **kwargs) - - datetime = __DatetimeSub + return dt - class __SparseArray(type): + elif name == "np": - from pandas.core.arrays.sparse import SparseArray as sa + warnings.warn( + "The pandas.np module is deprecated " + "and will be removed from pandas in a future version. " + "Import numpy directly instead", + FutureWarning, + stacklevel=2, + ) + import numpy as np - SparseArray = sa + return np - def __instancecheck__(cls, other): - return isinstance(other, cls.SparseArray) + elif name in {"SparseSeries", "SparseDataFrame"}: + warnings.warn( + f"The {name} class is removed from pandas. Accessing it from " + "the top-level namespace will also be removed in the next version", + FutureWarning, + stacklevel=2, + ) - class __SparseArraySub(metaclass=__SparseArray): - def emit_warning(dummy=0): - import warnings + return type(name, (), {}) - warnings.warn( - "The pandas.SparseArray class is deprecated " - "and will be removed from pandas in a future version. " - "Use pandas.arrays.SparseArray instead.", - FutureWarning, - stacklevel=3, - ) + elif name == "SparseArray": - def __new__(cls, *args, **kwargs): - cls.emit_warning() - from pandas.core.arrays.sparse import SparseArray as sa + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. 
" + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=2, + ) + from pandas.core.arrays.sparse import SparseArray as _SparseArray - return sa(*args, **kwargs) + return _SparseArray - SparseArray = __SparseArraySub + raise AttributeError(f"module 'pandas' has no attribute '{name}'") # module level doc-string diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index b5a1dc2b2fb94..ab2835932c95d 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -14,7 +14,6 @@ from pandas._typing import F -PY37 = sys.version_info >= (3, 7) PY38 = sys.version_info >= (3, 8) PY39 = sys.version_info >= (3, 9) PYPY = platform.python_implementation() == "PyPy" diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 789a4668b6fee..08d06da93bb45 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -8,19 +8,19 @@ # numpy versioning _np_version = np.__version__ _nlv = LooseVersion(_np_version) -_np_version_under1p16 = _nlv < LooseVersion("1.16") _np_version_under1p17 = _nlv < LooseVersion("1.17") _np_version_under1p18 = _nlv < LooseVersion("1.18") _np_version_under1p19 = _nlv < LooseVersion("1.19") _np_version_under1p20 = _nlv < LooseVersion("1.20") _is_numpy_dev = ".dev" in str(_nlv) +_min_numpy_ver = "1.16.5" -if _nlv < "1.15.4": +if _nlv < _min_numpy_ver: raise ImportError( - "this version of pandas is incompatible with numpy < 1.15.4\n" + f"this version of pandas is incompatible with numpy < {_min_numpy_ver}\n" f"your numpy version is {_np_version}.\n" - "Please upgrade numpy to >= 1.15.4 to use this pandas version" + f"Please upgrade numpy to >= {_min_numpy_ver} to use this pandas version" ) @@ -65,7 +65,6 @@ def np_array_datetime64_compat(arr, *args, **kwargs): __all__ = [ "np", "_np_version", - "_np_version_under1p16", "_np_version_under1p17", "_is_numpy_dev", ] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9d0751fcce460..547d86f221b5f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -58,7 +58,6 @@ StorageOptions, ValueKeyFunc, ) -from pandas.compat import PY37 from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -1088,9 +1087,7 @@ def itertuples(self, index=True, name="Pandas"): # use integer indexing because of possible duplicate column names arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) - # Python versions before 3.7 support at most 255 arguments to constructors - can_return_named_tuples = PY37 or len(self.columns) + index < 255 - if name is not None and can_return_named_tuples: + if name is not None: itertuple = collections.namedtuple(name, fields, rename=True) return map(itertuple._make, zip(*arrays)) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index caa348d3a1fb9..1d25336cd3b70 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -5,7 +5,7 @@ import pytest import pandas as pd -from pandas import api, compat +from pandas import api import pandas._testing as tm @@ -100,11 +100,6 @@ class TestPDApi(Base): # these should be deprecated in the future deprecated_classes_in_future: List[str] = ["SparseArray"] - if not compat.PY37: - classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) - # deprecated_modules.extend(["np", "datetime"]) - # deprecated_classes_in_future.extend(["SparseArray"]) - # external modules exposed in pandas namespace modules: List[str] = [] @@ -193,7 +188,6 @@ 
class TestPDApi(Base): "_hashtable", "_lib", "_libs", - "_np_version_under1p16", "_np_version_under1p17", "_np_version_under1p18", "_is_numpy_dev", @@ -217,14 +211,6 @@ def test_api(self): + self.funcs_to + self.private_modules ) - if not compat.PY37: - checkthese.extend( - self.deprecated_modules - + self.deprecated_classes - + self.deprecated_classes_in_future - + self.deprecated_funcs_in_future - + self.deprecated_funcs - ) self.check(pd, checkthese, self.ignored) def test_depr(self): @@ -237,14 +223,7 @@ def test_depr(self): ) for depr in deprecated_list: with tm.assert_produces_warning(FutureWarning): - deprecated = getattr(pd, depr) - if not compat.PY37: - if depr == "datetime": - deprecated.__getattr__(dir(pd.datetime.datetime)[-1]) - elif depr == "SparseArray": - deprecated([]) - else: - deprecated.__getattr__(dir(deprecated)[-1]) + _ = getattr(pd, depr) def test_datetime(): diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index ca942c9288898..89fbfbd5b8324 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat.numpy import _np_version_under1p16 - from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype from pandas.core.dtypes.dtypes import CategoricalDtype @@ -637,7 +635,6 @@ def test_constructor_imaginary(self): tm.assert_index_equal(c1.categories, Index(values)) tm.assert_numpy_array_equal(np.array(c1), np.array(values)) - @pytest.mark.skipif(_np_version_under1p16, reason="Skipping for NumPy <1.16") def test_constructor_string_and_tuples(self): # GH 21416 c = pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object)) diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 7841360e568ed..12426a0c92c55 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas.compat import PY37 - import pandas as pd import pandas._testing as tm from pandas.tests.extension import base @@ -62,13 +60,11 @@ def test_from_dtype(self, data): def test_from_sequence_from_cls(self, data): super().test_from_sequence_from_cls(data) - @pytest.mark.skipif(not PY37, reason="timeout on Linux py36_locale") @pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899") def test_series_constructor_no_data_with_index(self, dtype, na_value): # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays super().test_series_constructor_no_data_with_index(dtype, na_value) - @pytest.mark.skipif(not PY37, reason="timeout on Linux py36_locale") @pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899") def test_series_constructor_scalar_na_with_index(self, dtype, na_value): # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 78000c0252375..b9219f9f833de 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas.compat.numpy import _np_version_under1p16 - import pandas as pd import pandas._testing as tm from pandas.core.arrays.numpy_ import PandasArray, PandasDtype @@ -46,11 +44,7 @@ def data(allow_in_pandas, dtype): @pytest.fixture def data_missing(allow_in_pandas, dtype): - # For NumPy <1.16, np.array([np.nan, 
(1,)]) raises - # ValueError: setting an array element with a sequence. if dtype.numpy_dtype == "object": - if _np_version_under1p16: - raise pytest.skip("Skipping for NumPy <1.16") return PandasArray(np.array([np.nan, (1,)], dtype=object)) return PandasArray(np.array([np.nan, 1.0])) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index cc57a3970d18b..2fb1f7f911a9c 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -6,7 +6,6 @@ import numpy as np import pytest -from pandas.compat import PY37 from pandas.util._test_decorators import async_mark, skip_if_no import pandas as pd @@ -274,10 +273,7 @@ def test_itertuples(self, float_frame): # will raise SyntaxError if trying to create namedtuple tup3 = next(df3.itertuples()) assert isinstance(tup3, tuple) - if PY37: - assert hasattr(tup3, "_fields") - else: - assert not hasattr(tup3, "_fields") + assert hasattr(tup3, "_fields") # GH 28282 df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}]) @@ -288,12 +284,7 @@ def test_itertuples(self, float_frame): df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}]) result_255_columns = next(df_255_columns.itertuples(index=False)) assert isinstance(result_255_columns, tuple) - - # Dataframes with >=255 columns will fallback to regular tuples on python < 3.7 - if PY37: - assert hasattr(result_255_columns, "_fields") - else: - assert not hasattr(result_255_columns, "_fields") + assert hasattr(result_255_columns, "_fields") def test_sequence_like_with_categorical(self): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index d0f774344a33d..c8f5b2b0f6364 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -10,7 +10,7 @@ import pytest import pytz -from pandas.compat import PY37, is_platform_little_endian +from pandas.compat import is_platform_little_endian from pandas.compat.numpy import _np_version_under1p19 from pandas.core.dtypes.common import is_integer_dtype @@ -1418,7 +1418,6 @@ def test_constructor_list_of_namedtuples(self): result = DataFrame(tuples, columns=["y", "z"]) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(not PY37, reason="Requires Python >= 3.7") def test_constructor_list_of_dataclasses(self): # GH21910 from dataclasses import make_dataclass @@ -1430,7 +1429,6 @@ def test_constructor_list_of_dataclasses(self): result = DataFrame(datas) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(not PY37, reason="Requires Python >= 3.7") def test_constructor_list_of_dataclasses_with_varying_types(self): # GH21910 from dataclasses import make_dataclass @@ -1447,7 +1445,6 @@ def test_constructor_list_of_dataclasses_with_varying_types(self): result = DataFrame(datas) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(not PY37, reason="Requires Python >= 3.7") def test_constructor_list_of_dataclasses_error_thrown(self): # GH21910 from dataclasses import make_dataclass diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index c74c1529eb537..13a32e285e70a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import PY37, is_platform_windows - import pandas as pd from pandas import ( Categorical, @@ -13,7 +11,6 @@ Index, MultiIndex, Series, - _np_version_under1p17, qcut, ) import pandas._testing as tm @@ -244,12 +241,6 
@@ def test_level_get_group(observed): tm.assert_frame_equal(result, expected) -# GH#21636 flaky on py37; may be related to older numpy, see discussion -# https://github.com/MacPython/pandas-wheels/pull/64 -@pytest.mark.xfail( - PY37 and _np_version_under1p17 and not is_platform_windows(), - reason="Flaky, GH-27902", -) @pytest.mark.parametrize("ordered", [True, False]) def test_apply(ordered): # GH 10138 diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 22b4ec189a0f1..8f1ed193b100f 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -256,6 +256,9 @@ def test_read_json_from_to_json_results(self): tm.assert_frame_equal(result1, df) tm.assert_frame_equal(result2, df) + @pytest.mark.filterwarnings( + "ignore:an integer is required (got type float)*:DeprecationWarning" + ) def test_to_json(self): df = self.df.copy() df.index.name = "idx" @@ -432,6 +435,9 @@ def test_to_json_categorical_index(self): assert result == expected + @pytest.mark.filterwarnings( + "ignore:an integer is required (got type float)*:DeprecationWarning" + ) def test_date_format_raises(self): with pytest.raises(ValueError): self.df.to_json(orient="table", date_format="epoch") diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c4db0170ecc90..1280d0fd434d5 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -35,6 +35,9 @@ def assert_json_roundtrip_equal(result, expected, orient): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:an integer is required (got type float)*:DeprecationWarning" +) @pytest.mark.filterwarnings("ignore:the 'numpy' keyword is deprecated:FutureWarning") class TestPandasContainer: @pytest.fixture(autouse=True) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 5154a9ba6fdf0..c84c0048cc838 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1138,6 +1138,7 @@ def test_parse_integers_above_fp_precision(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.skip("unreliable test #35214") def test_chunks_have_consistent_numerical_type(all_parsers): parser = all_parsers integers = [str(i) for i in range(499999)] @@ -1151,6 +1152,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers): assert result.a.dtype == float +@pytest.mark.skip("unreliable test #35214") def test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 03830019affa1..09d5d9c1677d0 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -308,10 +308,6 @@ def test_overlap_public_nat_methods(klass, expected): # In case when Timestamp, Timedelta, and NaT are overlap, the overlap # is considered to be with Timestamp and NaT, not Timedelta. 
-    # "fromisoformat" was introduced in 3.7
-    if klass is Timestamp and not compat.PY37:
-        expected.remove("fromisoformat")
-
     # "fromisocalendar" was introduced in 3.8
     if klass is Timestamp and not compat.PY38:
         expected.remove("fromisocalendar")
diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py
index 8c51908c547f4..d1ab797056ece 100644
--- a/pandas/tests/tseries/offsets/test_offsets.py
+++ b/pandas/tests/tseries/offsets/test_offsets.py
@@ -14,7 +14,6 @@
 import pandas._libs.tslibs.offsets as liboffsets
 from pandas._libs.tslibs.offsets import ApplyTypeError, _get_offset, _offset_map
 from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG
-import pandas.compat as compat
 from pandas.compat.numpy import np_datetime64_compat
 from pandas.errors import PerformanceWarning
 
@@ -744,10 +743,7 @@ def test_repr(self):
         assert repr(self.offset) == "<BusinessDay>"
         assert repr(self.offset2) == "<2 * BusinessDays>"
 
-        if compat.PY37:
-            expected = "<BusinessDay: offset=datetime.timedelta(days=1)>"
-        else:
-            expected = "<BusinessDay: offset=datetime.timedelta(1)>"
+        expected = "<BusinessDay: offset=datetime.timedelta(days=1)>"
         assert repr(self.offset + timedelta(1)) == expected
 
     def test_with_offset(self):
@@ -2636,10 +2632,7 @@ def test_repr(self):
         assert repr(self.offset) == "<CustomBusinessDay>"
         assert repr(self.offset2) == "<2 * CustomBusinessDays>"
 
-        if compat.PY37:
-            expected = "<CustomBusinessDay: offset=datetime.timedelta(days=1)>"
-        else:
-            expected = "<CustomBusinessDay: offset=datetime.timedelta(1)>"
+        expected = "<CustomBusinessDay: offset=datetime.timedelta(days=1)>"
         assert repr(self.offset + timedelta(1)) == expected
 
     def test_with_offset(self):
diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py
index b5271dbc0443e..9f2bf156b7e37 100644
--- a/pandas/util/__init__.py
+++ b/pandas/util/__init__.py
@@ -1,30 +1,12 @@
 from pandas.util._decorators import Appender, Substitution, cache_readonly  # noqa
 
-from pandas import compat
 from pandas.core.util.hashing import hash_array, hash_pandas_object  # noqa
 
-# compatibility for import pandas; pandas.util.testing
-if compat.PY37:
+def __getattr__(name):
+    if name == "testing":
+        import pandas.util.testing
 
-    def __getattr__(name):
-        if name == "testing":
-            import pandas.util.testing
-
-            return pandas.util.testing
-        else:
-            raise AttributeError(f"module 'pandas.util' has no attribute '{name}'")
-
-
-else:
-
-    class _testing:
-        def __getattr__(self, item):
-            import pandas.util.testing
-
-            return getattr(pandas.util.testing, item)
-
-    testing = _testing()
-
-
-del compat
+        return pandas.util.testing
+    else:
+        raise AttributeError(f"module 'pandas.util' has no attribute '{name}'")
diff --git a/pyproject.toml b/pyproject.toml
index f282f2a085000..f6f8081b6c464 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,16 +5,14 @@ requires = [
     "setuptools",
     "wheel",
     "Cython>=0.29.16,<3",  # Note: sync with setup.py
-    "numpy==1.15.4; python_version=='3.6' and platform_system!='AIX'",
-    "numpy==1.15.4; python_version=='3.7' and platform_system!='AIX'",
+    "numpy==1.16.5; python_version=='3.7' and platform_system!='AIX'",
     "numpy==1.17.3; python_version>='3.8' and platform_system!='AIX'",
-    "numpy==1.16.0; python_version=='3.6' and platform_system=='AIX'",
-    "numpy==1.16.0; python_version=='3.7' and platform_system=='AIX'",
+    "numpy==1.16.5; python_version=='3.7' and platform_system=='AIX'",
     "numpy==1.17.3; python_version>='3.8' and platform_system=='AIX'",
 ]
 
 [tool.black]
-target-version = ['py36', 'py37', 'py38']
+target-version = ['py37', 'py38']
 exclude = '''
 (
     asv_bench/env
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 6a87b0a99a4f8..66e72641cd5bb 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,7 +1,7 @@
 # This file is auto-generated from environment.yml, do not modify.
# See that file for comments about the need/usage of each dependency. -numpy>=1.15,<1.19.0 +numpy>=1.16.5,<1.19.0 python-dateutil>=2.7.3 pytz asv @@ -60,10 +60,10 @@ xlsxwriter xlwt odfpy fastparquet>=0.3.2 -pyarrow>=0.13.1 +pyarrow>=0.15.0 python-snappy pyqt5>=5.9.2 -tables>=3.4.3 +tables>=3.4.4 s3fs>=0.4.0 fsspec>=0.7.4 gcsfs>=0.6.0 diff --git a/setup.py b/setup.py index aebbdbf4d1e96..43d19d525876b 100755 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def is_platform_mac(): return sys.platform == "darwin" -min_numpy_ver = "1.15.4" +min_numpy_ver = "1.16.5" min_cython_ver = "0.29.16" # note: sync with pyproject.toml try: @@ -197,7 +197,6 @@ def build_extensions(self): "Intended Audience :: Science/Research", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Cython", @@ -742,7 +741,7 @@ def setup_package(): setuptools_kwargs = { "install_requires": [ "python-dateutil >= 2.7.3", - "pytz >= 2017.2", + "pytz >= 2017.3", f"numpy >= {min_numpy_ver}", ], "setup_requires": [f"numpy >= {min_numpy_ver}"], @@ -766,11 +765,11 @@ def setup_package(): long_description=LONG_DESCRIPTION, classifiers=CLASSIFIERS, platforms="any", - python_requires=">=3.6.1", + python_requires=">=3.7.1", extras_require={ "test": [ # sync with setup.cfg minversion & install.rst - "pytest>=4.0.2", + "pytest>=5.0.1", "pytest-xdist", "hypothesis>=3.58", ] From 3c87b019b63ebff3c56acb6402fba5bc6f99671f Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 11 Aug 2020 16:57:56 -0500 Subject: [PATCH 29/51] CI/TST: change skip to xfail #35660 (#35672) --- pandas/tests/io/parser/test_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index c84c0048cc838..3d5f6ae3a4af9 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1138,7 +1138,7 @@ def test_parse_integers_above_fp_precision(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.skip("unreliable test #35214") +@pytest.mark.xfail(reason="ResourceWarning #35660", strict=False) def test_chunks_have_consistent_numerical_type(all_parsers): parser = all_parsers integers = [str(i) for i in range(499999)] @@ -1152,7 +1152,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers): assert result.a.dtype == float -@pytest.mark.skip("unreliable test #35214") +@pytest.mark.xfail(reason="ResourceWarning #35660", strict=False) def test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers From 59ffa251269ddb8b561d8d88f631a5db21f660dc Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 12 Aug 2020 13:22:54 +0100 Subject: [PATCH 30/51] CLN: consistent signatures for equals methods (#35636) --- pandas/_libs/sparse.pyx | 4 ++-- pandas/core/arrays/base.py | 14 ++++++++++---- pandas/core/arrays/categorical.py | 6 ++++-- pandas/core/generic.py | 4 +++- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/datetimelike.py | 6 +++--- pandas/core/indexes/interval.py | 7 ++++--- pandas/core/indexes/multi.py | 11 +++++------ pandas/core/indexes/range.py | 2 +- pandas/core/internals/managers.py | 5 ++++- 11 files changed, 38 insertions(+), 25 deletions(-) diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 321d7c374d8ec..0c3d8915b749b 100644 --- 
a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -103,7 +103,7 @@ cdef class IntIndex(SparseIndex): if not monotonic: raise ValueError("Indices must be strictly increasing") - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: if not isinstance(other, IntIndex): return False @@ -399,7 +399,7 @@ cdef class BlockIndex(SparseIndex): if blengths[i] == 0: raise ValueError(f'Zero-length block {i}') - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: if not isinstance(other, BlockIndex): return False diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 921927325a144..d85647edc3b81 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -7,7 +7,7 @@ without warning. """ import operator -from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union, cast import numpy as np @@ -20,7 +20,12 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.cast import maybe_cast_to_extension_array -from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype +from pandas.core.dtypes.common import ( + is_array_like, + is_dtype_equal, + is_list_like, + pandas_dtype, +) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -742,7 +747,7 @@ def searchsorted(self, value, side="left", sorter=None): arr = self.astype(object) return arr.searchsorted(value, side=side, sorter=sorter) - def equals(self, other: "ExtensionArray") -> bool: + def equals(self, other: object) -> bool: """ Return if another array is equivalent to this array. @@ -762,7 +767,8 @@ def equals(self, other: "ExtensionArray") -> bool: """ if not type(self) == type(other): return False - elif not self.dtype == other.dtype: + other = cast(ExtensionArray, other) + if not is_dtype_equal(self.dtype, other.dtype): return False elif not len(self) == len(other): return False diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6e5c7bc699962..a28b341669918 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2242,7 +2242,7 @@ def _from_factorized(cls, uniques, original): original.categories.take(uniques), dtype=original.dtype ) - def equals(self, other): + def equals(self, other: object) -> bool: """ Returns True if categorical arrays are equal. @@ -2254,7 +2254,9 @@ def equals(self, other): ------- bool """ - if self.is_dtype_equal(other): + if not isinstance(other, Categorical): + return False + elif self.is_dtype_equal(other): if self.categories.equals(other.categories): # fastpath to avoid re-coding other_codes = other._codes diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 520023050d49d..11147bffa32c3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -22,6 +22,7 @@ Tuple, Type, Union, + cast, ) import warnings import weakref @@ -1196,7 +1197,7 @@ def _indexed_same(self, other) -> bool: self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS ) - def equals(self, other): + def equals(self, other: object) -> bool: """ Test whether two objects contain the same elements. 
@@ -1276,6 +1277,7 @@ def equals(self, other): """ if not (isinstance(other, type(self)) or isinstance(self, type(other))): return False + other = cast(NDFrame, other) return self._mgr.equals(other._mgr) # ------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ecd3670e724a1..623ce68201492 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4176,7 +4176,7 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) - def equals(self, other: Any) -> bool: + def equals(self, other: object) -> bool: """ Determine if two Index object are equal. diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index fb283cbe02954..4990e6a8e20e9 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -290,7 +290,7 @@ def _is_dtype_compat(self, other) -> bool: return other - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determine if two CategoricalIndex objects contain the same elements. diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 8ccdab21339df..6d9d75a69e91d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -24,7 +24,7 @@ is_scalar, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCIndex, ABCSeries from pandas.core import algorithms from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray @@ -130,14 +130,14 @@ def __array_wrap__(self, result, context=None): # ------------------------------------------------------------------------ - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determines if two Index objects contain the same elements. """ if self.is_(other): return True - if not isinstance(other, ABCIndexClass): + if not isinstance(other, Index): return False elif not isinstance(other, type(self)): try: diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 9548ebbd9c3b2..e8d0a44324cc5 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1005,19 +1005,20 @@ def _format_space(self) -> str: def argsort(self, *args, **kwargs) -> np.ndarray: return np.lexsort((self.right, self.left)) - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determines if two IntervalIndex objects contain the same elements. 
""" if self.is_(other): return True - # if we can coerce to an II - # then we can compare + # if we can coerce to an IntervalIndex then we can compare if not isinstance(other, IntervalIndex): if not is_interval_dtype(other): return False other = Index(other) + if not isinstance(other, IntervalIndex): + return False return ( self.left.equals(other.left) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 448c2dfe4a29d..ffbd03d0c3ba7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3221,7 +3221,7 @@ def truncate(self, before=None, after=None): verify_integrity=False, ) - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determines if two MultiIndex objects have the same labeling information (the levels themselves do not necessarily have to be the same) @@ -3264,11 +3264,10 @@ def equals(self, other) -> bool: np.asarray(other.levels[i]._values), other_codes, allow_fill=False ) - # since we use NaT both datetime64 and timedelta64 - # we can have a situation where a level is typed say - # timedelta64 in self (IOW it has other values than NaT) - # but types datetime64 in other (where its all NaT) - # but these are equivalent + # since we use NaT both datetime64 and timedelta64 we can have a + # situation where a level is typed say timedelta64 in self (IOW it + # has other values than NaT) but types datetime64 in other (where + # its all NaT) but these are equivalent if len(self_values) == 0 and len(other_values) == 0: continue diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 3577a7aacc008..6080c32052266 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -433,7 +433,7 @@ def argsort(self, *args, **kwargs) -> np.ndarray: else: return np.arange(len(self) - 1, -1, -1) - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determines if two Index objects contain the same elements. 
""" diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index aa74d173d69b3..371b721f08b27 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1437,7 +1437,10 @@ def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True ) - def equals(self, other: "BlockManager") -> bool: + def equals(self, other: object) -> bool: + if not isinstance(other, BlockManager): + return False + self_axes, other_axes = self.axes, other.axes if len(self_axes) != len(other_axes): return False From bf8e9efe0f1ae09dc39e13e2f56c82d503138b06 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 12 Aug 2020 15:14:30 -0700 Subject: [PATCH 31/51] BUG: Support custom BaseIndexers in groupby.rolling (#35647) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/window/indexers.py | 14 ++++++++++---- pandas/core/window/rolling.py | 15 +++++++++++---- pandas/tests/window/test_grouper.py | 23 +++++++++++++++++++++++ 4 files changed, 45 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 415f9e508feb8..cdc244ca193b4 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) +- Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index bc36bdca982e8..7cbe34cdebf9f 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -1,6 +1,6 @@ """Indexer objects for computing start/end window bounds for rolling operations""" from datetime import timedelta -from typing import Dict, Optional, Tuple, Type, Union +from typing import Dict, Optional, Tuple, Type import numpy as np @@ -265,7 +265,8 @@ def __init__( index_array: Optional[np.ndarray], window_size: int, groupby_indicies: Dict, - rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]], + rolling_indexer: Type[BaseIndexer], + indexer_kwargs: Optional[Dict], **kwargs, ): """ @@ -276,7 +277,10 @@ def __init__( """ self.groupby_indicies = groupby_indicies self.rolling_indexer = rolling_indexer - super().__init__(index_array, window_size, **kwargs) + self.indexer_kwargs = indexer_kwargs or {} + super().__init__( + index_array, self.indexer_kwargs.pop("window_size", window_size), **kwargs + ) @Appender(get_window_bounds_doc) def get_window_bounds( @@ -298,7 +302,9 @@ def get_window_bounds( else: index_array = self.index_array indexer = self.rolling_indexer( - index_array=index_array, window_size=self.window_size, + index_array=index_array, + window_size=self.window_size, + **self.indexer_kwargs, ) start, end = indexer.get_window_bounds( len(indicies), min_periods, center, closed diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 7347d5686aabc..0306d4de2fc73 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -145,7 +145,7 @@ class _Window(PandasObject, ShallowMixin, SelectionMixin): def __init__( self, - obj, + obj: FrameOrSeries, window=None, min_periods: Optional[int] = None, center: bool = False, @@ -2271,10 +2271,16 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: ------- GroupbyRollingIndexer """ - rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]] - if self.is_freq_type: + rolling_indexer: Type[BaseIndexer] + indexer_kwargs: Optional[Dict] = None + index_array = self.obj.index.asi8 + if isinstance(self.window, BaseIndexer): + rolling_indexer = type(self.window) + indexer_kwargs = self.window.__dict__ + # We'll be using the index of each group later + indexer_kwargs.pop("index_array", None) + elif self.is_freq_type: rolling_indexer = VariableWindowIndexer - index_array = self.obj.index.asi8 else: rolling_indexer = FixedWindowIndexer index_array = None @@ -2283,6 +2289,7 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: window_size=window, groupby_indicies=self._groupby.indices, rolling_indexer=rolling_indexer, + indexer_kwargs=indexer_kwargs, ) return window_indexer diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index e1dcac06c39cc..a9590c7e1233a 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -305,6 +305,29 @@ def test_groupby_subselect_rolling(self): ) tm.assert_series_equal(result, expected) + def test_groupby_rolling_custom_indexer(self): + # GH 35557 + class SimpleIndexer(pd.api.indexers.BaseIndexer): + def get_window_bounds( + self, num_values=0, min_periods=None, center=None, closed=None + ): + min_periods = self.window_size if min_periods is None else 0 + end = np.arange(num_values, dtype=np.int64) + 1 + start = end.copy() - self.window_size + 
start[start < 0] = min_periods + return start, end + + df = pd.DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0, 5.0] * 3}, index=[0] * 5 + [1] * 5 + [2] * 5 + ) + result = ( + df.groupby(df.index) + .rolling(SimpleIndexer(window_size=3), min_periods=1) + .sum() + ) + expected = df.groupby(df.index).rolling(window=3, min_periods=1).sum() + tm.assert_frame_equal(result, expected) + def test_groupby_rolling_subset_with_closed(self): # GH 35549 df = pd.DataFrame( From 8145ea690f496c915409096a770d5cb835f2ec3f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 12 Aug 2020 15:15:35 -0700 Subject: [PATCH 32/51] REF: _cython_agg_blocks follow patterns similar to _apply_blockwise (#35632) --- pandas/core/groupby/generic.py | 41 +++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 53242c0332a8c..b7280a9f7db3c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1026,8 +1026,7 @@ def _cython_agg_blocks( if numeric_only: data = data.get_numeric_data(copy=False) - agg_blocks: List[Block] = [] - new_items: List[np.ndarray] = [] + agg_blocks: List["Block"] = [] deleted_items: List[np.ndarray] = [] no_result = object() @@ -1056,11 +1055,12 @@ def cast_result_block(result, block: "Block", how: str) -> "Block": # reshape to be valid for non-Extension Block result = result.reshape(1, -1) - agg_block: Block = block.make_block(result) + agg_block: "Block" = block.make_block(result) return agg_block - for block in data.blocks: - # Avoid inheriting result from earlier in the loop + def blk_func(block: "Block") -> List["Block"]: + new_blocks: List["Block"] = [] + result = no_result locs = block.mgr_locs.as_array try: @@ -1076,8 +1076,7 @@ def cast_result_block(result, block: "Block", how: str) -> "Block": # we cannot perform the operation # in an alternate way, exclude the block assert how == "ohlc" - deleted_items.append(locs) - continue + raise # call our grouper again with only this block obj = self.obj[data.items[locs]] @@ -1096,8 +1095,7 @@ def cast_result_block(result, block: "Block", how: str) -> "Block": except TypeError: # we may have an exception in trying to aggregate # continue and exclude the block - deleted_items.append(locs) - continue + raise else: result = cast(DataFrame, result) # unwrap DataFrame to get array @@ -1108,20 +1106,33 @@ def cast_result_block(result, block: "Block", how: str) -> "Block": # clean, we choose to clean up this mess later on. 
assert len(locs) == result.shape[1] for i, loc in enumerate(locs): - new_items.append(np.array([loc], dtype=locs.dtype)) agg_block = result.iloc[:, [i]]._mgr.blocks[0] - agg_blocks.append(agg_block) + new_blocks.append(agg_block) else: result = result._mgr.blocks[0].values if isinstance(result, np.ndarray) and result.ndim == 1: result = result.reshape(1, -1) agg_block = cast_result_block(result, block, how) - new_items.append(locs) - agg_blocks.append(agg_block) + new_blocks = [agg_block] else: agg_block = cast_result_block(result, block, how) - new_items.append(locs) - agg_blocks.append(agg_block) + new_blocks = [agg_block] + return new_blocks + + skipped: List[int] = [] + new_items: List[np.ndarray] = [] + for i, block in enumerate(data.blocks): + try: + nbs = blk_func(block) + except (NotImplementedError, TypeError): + # TypeError -> we may have an exception in trying to aggregate + # continue and exclude the block + # NotImplementedError -> "ohlc" with wrong dtype + skipped.append(i) + deleted_items.append(block.mgr_locs.as_array) + else: + agg_blocks.extend(nbs) + new_items.append(block.mgr_locs.as_array) if not agg_blocks: raise DataError("No numeric types to aggregate") From a0896e107ac03cd667b47e7be2059cd261b65938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Wed, 12 Aug 2020 18:23:25 -0400 Subject: [PATCH 33/51] BUG: to_pickle/read_pickle do not close user-provided file objects (#35686) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/pickle.py | 8 ++++++-- pandas/tests/io/test_pickle.py | 9 +++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 86f47a5826214..deb5697053ea8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -234,7 +234,7 @@ I/O - Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) - In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`) -- +- :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`) Plotting ^^^^^^^^ diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 549d55e65546d..eee6ec7c9feca 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -100,7 +100,9 @@ def to_pickle( try: f.write(pickle.dumps(obj, protocol=protocol)) finally: - f.close() + if f != filepath_or_buffer: + # do not close user-provided file objects GH 35679 + f.close() for _f in fh: _f.close() if should_close: @@ -215,7 +217,9 @@ def read_pickle( # e.g. 
can occur for files written in py27; see GH#28645 and GH#31988
                return pc.load(f, encoding="latin-1")
     finally:
-        f.close()
+        if f != filepath_or_buffer:
+            # do not close user-provided file objects GH 35679
+            f.close()
         for _f in fh:
             _f.close()
         if should_close:
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
index e4d43db7834e3..6331113ab8945 100644
--- a/pandas/tests/io/test_pickle.py
+++ b/pandas/tests/io/test_pickle.py
@@ -183,6 +183,15 @@ def python_unpickler(path):
         result = python_unpickler(path)
         compare_element(result, expected, typ)
 
+    # and the same for file objects (GH 35679)
+    with open(path, mode="wb") as handle:
+        writer(expected, path)
+        handle.seek(0)  # shouldn't close file handle
+    with open(path, mode="rb") as handle:
+        result = pd.read_pickle(handle)
+        handle.seek(0)  # shouldn't close file handle
+    compare_element(result, expected, typ)
+
 
 def test_pickle_path_pathlib():
     df = tm.makeDataFrame()

From 40795053aaf82f8f55abe56001b1276b1ef0a916 Mon Sep 17 00:00:00 2001
From: Yutaro Ikeda
Date: Thu, 13 Aug 2020 19:15:51 +0900
Subject: [PATCH 34/51] BUG: GH-35558 merge_asof tolerance error (#35654)

---
 doc/source/whatsnew/v1.1.1.rst                |  1 +
 pandas/core/reshape/merge.py                  |  2 +-
 pandas/tests/reshape/merge/test_merge_asof.py | 22 +++++++++++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst
index cdc244ca193b4..b37103910afab 100644
--- a/doc/source/whatsnew/v1.1.1.rst
+++ b/doc/source/whatsnew/v1.1.1.rst
@@ -22,6 +22,7 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`)
 - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`)
 - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`)
+- Fixed regression where :meth:`DataFrame.merge_asof` would raise an ``UnboundLocalError`` when ``left_index``, ``right_index`` and ``tolerance`` were set (:issue:`35558`)
 - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`)
 
 ..
--------------------------------------------------------------------------- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 27b331babe692..2349cb1dcc0c7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1667,7 +1667,7 @@ def _get_merge_keys(self): msg = ( f"incompatible tolerance {self.tolerance}, must be compat " - f"with type {repr(lk.dtype)}" + f"with type {repr(lt.dtype)}" ) if needs_i8_conversion(lt): diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 9b09f0033715d..895de2b748c34 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1339,3 +1339,25 @@ def test_merge_index_column_tz(self): index=pd.Index([0, 1, 2, 3, 4]), ) tm.assert_frame_equal(result, expected) + + def test_left_index_right_index_tolerance(self): + # https://github.com/pandas-dev/pandas/issues/35558 + dr1 = pd.date_range( + start="1/1/2020", end="1/20/2020", freq="2D" + ) + pd.Timedelta(seconds=0.4) + dr2 = pd.date_range(start="1/1/2020", end="2/1/2020") + + df1 = pd.DataFrame({"val1": "foo"}, index=pd.DatetimeIndex(dr1)) + df2 = pd.DataFrame({"val2": "bar"}, index=pd.DatetimeIndex(dr2)) + + expected = pd.DataFrame( + {"val1": "foo", "val2": "bar"}, index=pd.DatetimeIndex(dr1) + ) + result = pd.merge_asof( + df1, + df2, + left_index=True, + right_index=True, + tolerance=pd.Timedelta(seconds=0.5), + ) + tm.assert_frame_equal(result, expected) From 288c8ed85c387ff3fef8458ee3ab9a3330645021 Mon Sep 17 00:00:00 2001 From: Elliot Rampono Date: Thu, 13 Aug 2020 12:21:46 -0400 Subject: [PATCH 35/51] Reorganize imports to be compliant with isort (and conventional) (#35708) --- web/pandas_web.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/web/pandas_web.py b/web/pandas_web.py index e62deaa8cdc7f..7dd63175e69ac 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -34,13 +34,12 @@ import time import typing +import feedparser import jinja2 +import markdown import requests import yaml -import feedparser -import markdown - class Preprocessors: """ From 7a675928f23839e9c9ddf3b049da51e5a20ce683 Mon Sep 17 00:00:00 2001 From: Elliot Rampono Date: Thu, 13 Aug 2020 13:52:32 -0400 Subject: [PATCH 36/51] add web/ directory to isort checks (#35709) Co-authored-by: elliot rampono --- ci/code_checks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 816bb23865c04..852f66763683b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -121,7 +121,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then # Imports - Check formatting using isort see setup.cfg for settings MSG='Check import format using isort' ; echo $MSG - ISORT_CMD="isort --quiet --check-only pandas asv_bench scripts" + ISORT_CMD="isort --quiet --check-only pandas asv_bench scripts web" if [[ "$GITHUB_ACTIONS" == "true" ]]; then eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]})) else From e530066aa5b0b6fee333a45a33424efb218d6556 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 13 Aug 2020 19:06:35 +0100 Subject: [PATCH 37/51] PERF: make RangeIndex iterate over ._range (#35676) --- asv_bench/benchmarks/index_object.py | 24 +++++++++++++++-------- pandas/core/indexes/range.py | 4 ++++ pandas/tests/indexes/ranges/test_range.py | 4 ++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py 
index b242de6a17208..9c05019c70396 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -57,8 +57,8 @@ def time_datetime_difference_disjoint(self): class Range: def setup(self): - self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3) - self.idx_dec = RangeIndex(start=10 ** 7, stop=-1, step=-3) + self.idx_inc = RangeIndex(start=0, stop=10 ** 6, step=3) + self.idx_dec = RangeIndex(start=10 ** 6, stop=-1, step=-3) def time_max(self): self.idx_inc.max() @@ -73,15 +73,23 @@ def time_min_trivial(self): self.idx_inc.min() def time_get_loc_inc(self): - self.idx_inc.get_loc(900000) + self.idx_inc.get_loc(900_000) def time_get_loc_dec(self): - self.idx_dec.get_loc(100000) + self.idx_dec.get_loc(100_000) + + def time_iter_inc(self): + for _ in self.idx_inc: + pass + + def time_iter_dec(self): + for _ in self.idx_dec: + pass class IndexEquals: def setup(self): - idx_large_fast = RangeIndex(100000) + idx_large_fast = RangeIndex(100_000) idx_small_slow = date_range(start="1/1/2012", periods=1) self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow]) @@ -94,7 +102,7 @@ def time_non_object_equals_multiindex(self): class IndexAppend: def setup(self): - N = 10000 + N = 10_000 self.range_idx = RangeIndex(0, 100) self.int_idx = self.range_idx.astype(int) self.obj_idx = self.int_idx.astype(str) @@ -168,7 +176,7 @@ def time_get_loc_non_unique_sorted(self, dtype): class Float64IndexMethod: # GH 13166 def setup(self): - N = 100000 + N = 100_000 a = np.arange(N) self.ind = Float64Index(a * 4.8000000418824129e-08) @@ -212,7 +220,7 @@ class GC: params = [1, 2, 5] def create_use_drop(self): - idx = Index(list(range(1000 * 1000))) + idx = Index(list(range(1_000_000))) idx._engine def peakmem_gc_instances(self, N): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 6080c32052266..c65c3d5ff3d9c 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -373,6 +373,10 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def tolist(self): return list(self._range) + @doc(Int64Index.__iter__) + def __iter__(self): + yield from self._range + @doc(Int64Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index ef4bb9a0869b0..c4c242746e92c 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -167,6 +167,10 @@ def test_cache(self): idx.any() assert idx._cache == {} + for _ in idx: + pass + assert idx._cache == {} + df = pd.DataFrame({"a": range(10)}, index=idx) df.loc[50] From 542b20aef5789b3eb90d49d3624c78997761c011 Mon Sep 17 00:00:00 2001 From: SylvainLan Date: Thu, 13 Aug 2020 20:17:39 +0200 Subject: [PATCH 38/51] Refactor tables latex (#35649) --- pandas/io/formats/latex.py | 103 +++++++---------------- pandas/tests/io/formats/test_to_latex.py | 3 +- 2 files changed, 33 insertions(+), 73 deletions(-) diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 5d6f0a08ef2b5..715b8bbdf5672 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -121,10 +121,7 @@ def pad_empties(x): else: column_format = self.column_format - if self.longtable: - self._write_longtable_begin(buf, column_format) - else: - self._write_tabular_begin(buf, column_format) + self._write_tabular_begin(buf, column_format) buf.write("\\toprule\n") @@ 
-190,10 +187,7 @@ def pad_empties(x): if self.multirow and i < len(strrows) - 1: self._print_cline(buf, i, len(strcols)) - if self.longtable: - self._write_longtable_end(buf) - else: - self._write_tabular_end(buf) + self._write_tabular_end(buf) def _format_multicolumn(self, row: List[str], ilevels: int) -> List[str]: r""" @@ -288,7 +282,7 @@ def _write_tabular_begin(self, buf, column_format: str): for 3 columns """ if self._table_float: - # then write output in a nested table/tabular environment + # then write output in a nested table/tabular or longtable environment if self.caption is None: caption_ = "" else: @@ -304,12 +298,27 @@ def _write_tabular_begin(self, buf, column_format: str): else: position_ = f"[{self.position}]" - buf.write(f"\\begin{{table}}{position_}\n\\centering{caption_}{label_}\n") + if self.longtable: + table_ = f"\\begin{{longtable}}{position_}{{{column_format}}}" + tabular_ = "\n" + else: + table_ = f"\\begin{{table}}{position_}\n\\centering" + tabular_ = f"\n\\begin{{tabular}}{{{column_format}}}\n" + + if self.longtable and (self.caption is not None or self.label is not None): + # a double-backslash is required at the end of the line + # as discussed here: + # https://tex.stackexchange.com/questions/219138 + backlash_ = "\\\\" + else: + backlash_ = "" + buf.write(f"{table_}{caption_}{label_}{backlash_}{tabular_}") else: - # then write output only in a tabular environment - pass - - buf.write(f"\\begin{{tabular}}{{{column_format}}}\n") + if self.longtable: + tabletype_ = "longtable" + else: + tabletype_ = "tabular" + buf.write(f"\\begin{{{tabletype_}}}{{{column_format}}}\n") def _write_tabular_end(self, buf): """ @@ -323,62 +332,12 @@ def _write_tabular_end(self, buf): a string. """ - buf.write("\\bottomrule\n") - buf.write("\\end{tabular}\n") - if self._table_float: - buf.write("\\end{table}\n") - else: - pass - - def _write_longtable_begin(self, buf, column_format: str): - """ - Write the beginning of a longtable environment including caption and - label if provided by user. - - Parameters - ---------- - buf : string or file handle - File path or object. If not specified, the result is returned as - a string. - column_format : str - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' - for 3 columns - """ - if self.caption is None: - caption_ = "" - else: - caption_ = f"\\caption{{{self.caption}}}" - - if self.label is None: - label_ = "" - else: - label_ = f"\\label{{{self.label}}}" - - if self.position is None: - position_ = "" + if self.longtable: + buf.write("\\end{longtable}\n") else: - position_ = f"[{self.position}]" - - buf.write( - f"\\begin{{longtable}}{position_}{{{column_format}}}\n{caption_}{label_}" - ) - if self.caption is not None or self.label is not None: - # a double-backslash is required at the end of the line - # as discussed here: - # https://tex.stackexchange.com/questions/219138 - buf.write("\\\\\n") - - @staticmethod - def _write_longtable_end(buf): - """ - Write the end of a longtable environment. - - Parameters - ---------- - buf : string or file handle - File path or object. If not specified, the result is returned as - a string. 
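# --- editor's annotation: illustrative sketch, not part of this patch ---
# With the unified begin/end writers above, a longtable render places the
# caption and label on their own lines (output shape taken from the test
# change further below; the frame, caption, and label are assumptions):
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
tex = df.to_latex(longtable=True, caption="cap", label="tab:x")
# tex begins:
# \begin{longtable}{lrl}
# \caption{cap}
# \label{tab:x}\\
# --- end annotation ---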
- - """ - buf.write("\\end{longtable}\n") + buf.write("\\bottomrule\n") + buf.write("\\end{tabular}\n") + if self._table_float: + buf.write("\\end{table}\n") + else: + pass diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 93ad3739e59c7..96a9ed2b86cf4 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -555,7 +555,8 @@ def test_to_latex_longtable_caption_label(self): result_cl = df.to_latex(longtable=True, caption=the_caption, label=the_label) expected_cl = r"""\begin{longtable}{lrl} -\caption{a table in a \texttt{longtable} environment}\label{tab:longtable}\\ +\caption{a table in a \texttt{longtable} environment} +\label{tab:longtable}\\ \toprule {} & a & b \\ \midrule From ed23eb894d2ce1c4e1ae2f530cb0394db0c4d8bc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 13 Aug 2020 12:18:47 -0700 Subject: [PATCH 39/51] CI: avoid file leaks in sas_xport tests (#35693) --- pandas/io/sas/sasreader.py | 10 ++++++++-- pandas/tests/io/sas/test_xport.py | 13 ++++++++++-- pandas/util/_test_decorators.py | 33 +++++++++++++++++++++---------- 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 291c9d1ee7f0c..fffdebda8c87a 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -6,7 +6,7 @@ from pandas._typing import FilePathOrBuffer, Label -from pandas.io.common import stringify_path +from pandas.io.common import get_filepath_or_buffer, stringify_path if TYPE_CHECKING: from pandas import DataFrame # noqa: F401 @@ -109,6 +109,10 @@ def read_sas( else: raise ValueError("unable to infer format of SAS file") + filepath_or_buffer, _, _, should_close = get_filepath_or_buffer( + filepath_or_buffer, encoding + ) + reader: ReaderBase if format.lower() == "xport": from pandas.io.sas.sas_xport import XportReader @@ -129,5 +133,7 @@ def read_sas( return reader data = reader.read() - reader.close() + + if should_close: + reader.close() return data diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index 2682bafedb8f1..939edb3d8e0b4 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -26,10 +28,12 @@ def setup_method(self, datapath): self.dirpath = datapath("io", "sas", "data") self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt") self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt") - self.file02b = open(os.path.join(self.dirpath, "SSHSV1_A.xpt"), "rb") self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt") self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt") + with td.file_leak_context(): + yield + def test1_basic(self): # Tests with DEMO_G.xpt (all numeric file) @@ -127,7 +131,12 @@ def test2_binary(self): data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv")) numeric_as_float(data_csv) - data = read_sas(self.file02b, format="xport") + with open(self.file02, "rb") as fd: + with td.file_leak_context(): + # GH#35693 ensure that if we pass an open file, we + # dont incorrectly close it in read_sas + data = read_sas(fd, format="xport") + tm.assert_frame_equal(data, data_csv) def test_multiple_types(self): diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index bdf633839b2cd..0dad8c7397e37 100644 --- a/pandas/util/_test_decorators.py +++ 
b/pandas/util/_test_decorators.py @@ -23,8 +23,8 @@ def test_foo(): For more information, refer to the ``pytest`` documentation on ``skipif``. """ +from contextlib import contextmanager from distutils.version import LooseVersion -from functools import wraps import locale from typing import Callable, Optional @@ -237,23 +237,36 @@ def documented_fixture(fixture): def check_file_leaks(func) -> Callable: """ - Decorate a test function tot check that we are not leaking file descriptors. + Decorate a test function to check that we are not leaking file descriptors. """ - psutil = safe_import("psutil") - if not psutil: + with file_leak_context(): return func - @wraps(func) - def new_func(*args, **kwargs): + +@contextmanager +def file_leak_context(): + """ + ContextManager analogue to check_file_leaks. + """ + psutil = safe_import("psutil") + if not psutil: + yield + else: proc = psutil.Process() flist = proc.open_files() + conns = proc.connections() - func(*args, **kwargs) + yield flist2 = proc.open_files() - assert flist2 == flist - - return new_func + # on some builds open_files includes file position, which we _dont_ + # expect to remain unchanged, so we need to compare excluding that + flist_ex = [(x.path, x.fd) for x in flist] + flist2_ex = [(x.path, x.fd) for x in flist2] + assert flist2_ex == flist_ex, (flist2, flist) + + conns2 = proc.connections() + assert conns2 == conns, (conns2, conns) def async_mark(): From 59febbd2554eee524ccf9938eac9491bbedb1e58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Thu, 13 Aug 2020 18:04:49 -0400 Subject: [PATCH 40/51] BUG/ENH: consistent gzip compression arguments (#35645) --- doc/source/user_guide/io.rst | 11 +++++--- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_typing.py | 5 ++++ pandas/core/generic.py | 13 +++++++-- pandas/io/common.py | 31 +++++++++++---------- pandas/io/formats/csvs.py | 6 ++-- pandas/io/json/_json.py | 31 +++++++++++++++------ pandas/io/pickle.py | 8 +++--- pandas/io/stata.py | 19 ++++--------- pandas/tests/io/test_compression.py | 43 +++++++++++++++++++++++++++++ 10 files changed, 118 insertions(+), 50 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 35403b5c8b66f..43030d76d945a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -287,16 +287,19 @@ Quoting, compression, and file format compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'`` For on-the-fly decompression of on-disk data. If 'infer', then use gzip, - bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2', + bz2, zip, or xz if ``filepath_or_buffer`` is path-like ending in '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to ``None`` for no decompression. Can also be a dict with key ``'method'`` - set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to - compression settings. As an example, the following could be passed for - faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``. + set to one of {``'zip'``, ``'gzip'``, ``'bz2'``} and other key-value pairs are + forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, or ``bz2.BZ2File``. + As an example, the following could be passed for faster compression and to + create a reproducible gzip archive: + ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``. .. 
versionchanged:: 0.24.0 'infer' option added and set to default. .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``. + .. versionchanged:: 1.2.0 Previous versions forwarded dict entries for 'gzip' to `gzip.open`. thousands : str, default ``None`` Thousands separator. decimal : str, default ``'.'`` diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index deb5697053ea8..6612f741d925d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -235,6 +235,7 @@ I/O - Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) - In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`) - :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`) +- :meth:`to_csv` passes compression arguments for `'gzip'` always to `gzip.GzipFile` (:issue:`28103`) Plotting ^^^^^^^^ diff --git a/pandas/_typing.py b/pandas/_typing.py index 47a102ddc70e0..1b972030ef5a5 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -109,3 +109,8 @@ # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] + + +# compression keywords and compression +CompressionDict = Mapping[str, Optional[Union[str, int, bool]]] +CompressionOptions = Optional[Union[str, CompressionDict]] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 11147bffa32c3..2219d54477d9e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -35,6 +35,7 @@ from pandas._libs.tslibs import Tick, Timestamp, to_offset from pandas._typing import ( Axis, + CompressionOptions, FilePathOrBuffer, FrameOrSeries, JSONSerializable, @@ -2058,7 +2059,7 @@ def to_json( date_unit: str = "ms", default_handler: Optional[Callable[[Any], JSONSerializable]] = None, lines: bool_t = False, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", index: bool_t = True, indent: Optional[int] = None, storage_options: StorageOptions = None, @@ -2646,7 +2647,7 @@ def to_sql( def to_pickle( self, path, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ) -> None: @@ -3053,7 +3054,7 @@ def to_csv( index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None, mode: str = "w", encoding: Optional[str] = None, - compression: Optional[Union[str, Mapping[str, str]]] = "infer", + compression: CompressionOptions = "infer", quoting: Optional[int] = None, quotechar: str = '"', line_terminator: Optional[str] = None, @@ -3144,6 +3145,12 @@ def to_csv( Compression is supported for binary file objects. + .. versionchanged:: 1.2.0 + + Previous versions forwarded dict entries for 'gzip' to + `gzip.open` instead of `gzip.GzipFile` which prevented + setting `mtime`. + quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. 
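# --- editor's annotation: illustrative sketch, not part of this patch ---
# The behaviour documented above, end to end: with dict entries now forwarded
# to gzip.GzipFile, pinning mtime makes repeated writes byte-identical
# (GH 28103). The file name is an assumption for illustration only.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df.to_csv(
    "out.csv.gz",
    compression={"method": "gzip", "compresslevel": 1, "mtime": 1},
)
# --- end annotation ---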
If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC diff --git a/pandas/io/common.py b/pandas/io/common.py index 9ac642e58b544..54f35e689aac8 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -18,7 +18,6 @@ Optional, Tuple, Type, - Union, ) from urllib.parse import ( urljoin, @@ -29,7 +28,12 @@ ) import zipfile -from pandas._typing import FilePathOrBuffer, StorageOptions +from pandas._typing import ( + CompressionDict, + CompressionOptions, + FilePathOrBuffer, + StorageOptions, +) from pandas.compat import _get_lzma_file, _import_lzma from pandas.compat._optional import import_optional_dependency @@ -160,7 +164,7 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, encoding: Optional[str] = None, - compression: Optional[str] = None, + compression: CompressionOptions = None, mode: Optional[str] = None, storage_options: StorageOptions = None, ): @@ -188,7 +192,7 @@ def get_filepath_or_buffer( Returns ------- - Tuple[FilePathOrBuffer, str, str, bool] + Tuple[FilePathOrBuffer, str, CompressionOptions, bool] Tuple containing the filepath or buffer, the encoding, the compression and should_close. """ @@ -291,8 +295,8 @@ def file_path_to_url(path: str) -> str: def get_compression_method( - compression: Optional[Union[str, Mapping[str, Any]]] -) -> Tuple[Optional[str], Dict[str, Any]]: + compression: CompressionOptions, +) -> Tuple[Optional[str], CompressionDict]: """ Simplifies a compression argument to a compression method string and a mapping containing additional arguments. @@ -316,7 +320,7 @@ def get_compression_method( if isinstance(compression, Mapping): compression_args = dict(compression) try: - compression_method = compression_args.pop("method") + compression_method = compression_args.pop("method") # type: ignore except KeyError as err: raise ValueError("If mapping, compression must have key 'method'") from err else: @@ -383,7 +387,7 @@ def get_handle( path_or_buf, mode: str, encoding=None, - compression: Optional[Union[str, Mapping[str, Any]]] = None, + compression: CompressionOptions = None, memory_map: bool = False, is_text: bool = True, errors=None, @@ -464,16 +468,13 @@ def get_handle( # GZ Compression if compression == "gzip": if is_path: - f = gzip.open(path_or_buf, mode, **compression_args) + f = gzip.GzipFile(filename=path_or_buf, mode=mode, **compression_args) else: f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args) # BZ Compression elif compression == "bz2": - if is_path: - f = bz2.BZ2File(path_or_buf, mode, **compression_args) - else: - f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args) + f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args) # ZIP Compression elif compression == "zip": @@ -577,7 +578,9 @@ def __init__( if mode in ["wb", "rb"]: mode = mode.replace("b", "") self.archive_name = archive_name - super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs) + kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED} + kwargs_zip.update(kwargs) + super().__init__(file, mode, **kwargs_zip) def write(self, data): archive_name = self.filename diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 6eceb94387171..c462a96da7133 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,13 +5,13 @@ import csv as csvlib from io import StringIO, TextIOWrapper import os -from typing import Hashable, List, Mapping, Optional, Sequence, Union +from typing import Hashable, List, 
Optional, Sequence, Union import warnings import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -44,7 +44,7 @@ def __init__( mode: str = "w", encoding: Optional[str] = None, errors: str = "strict", - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", quoting: Optional[int] = None, line_terminator="\n", chunksize: Optional[int] = None, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 0d2b351926343..c2bd6302940bb 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -3,13 +3,13 @@ from io import BytesIO, StringIO from itertools import islice import os -from typing import Any, Callable, Optional, Type +from typing import IO, Any, Callable, List, Optional, Type import numpy as np import pandas._libs.json as json from pandas._libs.tslibs import iNaT -from pandas._typing import JSONSerializable, StorageOptions +from pandas._typing import CompressionOptions, JSONSerializable, StorageOptions from pandas.errors import AbstractMethodError from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments @@ -19,7 +19,12 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.reshape.concat import concat -from pandas.io.common import get_filepath_or_buffer, get_handle, infer_compression +from pandas.io.common import ( + get_compression_method, + get_filepath_or_buffer, + get_handle, + infer_compression, +) from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import build_table_schema, parse_table_schema from pandas.io.parsers import _validate_integer @@ -41,7 +46,7 @@ def to_json( date_unit: str = "ms", default_handler: Optional[Callable[[Any], JSONSerializable]] = None, lines: bool = False, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", index: bool = True, indent: int = 0, storage_options: StorageOptions = None, @@ -369,7 +374,7 @@ def read_json( encoding=None, lines: bool = False, chunksize: Optional[int] = None, - compression="infer", + compression: CompressionOptions = "infer", nrows: Optional[int] = None, storage_options: StorageOptions = None, ): @@ -607,7 +612,9 @@ def read_json( if encoding is None: encoding = "utf-8" - compression = infer_compression(path_or_buf, compression) + compression_method, compression = get_compression_method(compression) + compression_method = infer_compression(path_or_buf, compression_method) + compression = dict(compression, method=compression_method) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, @@ -667,10 +674,13 @@ def __init__( encoding, lines: bool, chunksize: Optional[int], - compression, + compression: CompressionOptions, nrows: Optional[int], ): + compression_method, compression = get_compression_method(compression) + compression = dict(compression, method=compression_method) + self.orient = orient self.typ = typ self.dtype = dtype @@ -687,6 +697,7 @@ def __init__( self.nrows_seen = 0 self.should_close = False self.nrows = nrows + self.file_handles: List[IO] = [] if self.chunksize is not None: self.chunksize = _validate_integer("chunksize", self.chunksize, 1) @@ -735,8 +746,8 @@ def _get_data_from_filepath(self, filepath_or_buffer): except (TypeError, ValueError): 
pass - if exists or self.compression is not None: - data, _ = get_handle( + if exists or self.compression["method"] is not None: + data, self.file_handles = get_handle( filepath_or_buffer, "r", encoding=self.encoding, @@ -816,6 +827,8 @@ def close(self): self.open_stream.close() except (IOError, AttributeError): pass + for file_handle in self.file_handles: + file_handle.close() def __next__(self): if self.nrows: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index eee6ec7c9feca..fc1d2e385cf72 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,9 +1,9 @@ """ pickle compat """ import pickle -from typing import Any, Optional +from typing import Any import warnings -from pandas._typing import FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions from pandas.compat import pickle_compat as pc from pandas.io.common import get_filepath_or_buffer, get_handle @@ -12,7 +12,7 @@ def to_pickle( obj: Any, filepath_or_buffer: FilePathOrBuffer, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ): @@ -114,7 +114,7 @@ def to_pickle( def read_pickle( filepath_or_buffer: FilePathOrBuffer, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ): """ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7a25617885839..ec3819f1673a8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -35,7 +35,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas._typing import FilePathOrBuffer, Label, StorageOptions +from pandas._typing import CompressionOptions, FilePathOrBuffer, Label, StorageOptions from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -1938,9 +1938,9 @@ def read_stata( def _open_file_binary_write( fname: FilePathOrBuffer, - compression: Union[str, Mapping[str, str], None], + compression: CompressionOptions, storage_options: StorageOptions = None, -) -> Tuple[BinaryIO, bool, Optional[Union[str, Mapping[str, str]]]]: +) -> Tuple[BinaryIO, bool, CompressionOptions]: """ Open a binary file or no-op if file-like. 
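# --- editor's annotation: illustrative sketch, not part of this patch ---
# How a compression argument is normalised by the internal helper used above
# (get_compression_method is a private helper; its signature and behaviour
# are as shown in this patch's pandas/io/common.py changes):
from pandas.io.common import get_compression_method

method, kwargs = get_compression_method({"method": "gzip", "mtime": 1})
assert method == "gzip" and kwargs == {"mtime": 1}
# --- end annotation ---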
@@ -1978,17 +1978,10 @@ def _open_file_binary_write( # Extract compression mode as given, if dict compression_typ, compression_args = get_compression_method(compression) compression_typ = infer_compression(fname, compression_typ) - path_or_buf, _, compression_typ, _ = get_filepath_or_buffer( - fname, - mode="wb", - compression=compression_typ, - storage_options=storage_options, + compression = dict(compression_args, method=compression_typ) + path_or_buf, _, compression, _ = get_filepath_or_buffer( + fname, mode="wb", compression=compression, storage_options=storage_options, ) - if compression_typ is not None: - compression = compression_args - compression["method"] = compression_typ - else: - compression = None f, _ = get_handle(path_or_buf, "wb", compression=compression, is_text=False) return f, True, compression else: diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 902a3d5d2a397..bc14b485f75e5 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,7 +1,10 @@ +import io import os +from pathlib import Path import subprocess import sys import textwrap +import time import pytest @@ -130,6 +133,46 @@ def test_compression_binary(compression_only): ) +def test_gzip_reproducibility_file_name(): + """ + Gzip should create reproducible archives with mtime. + + Note: Archives created with different filenames will still be different! + + GH 28103 + """ + df = tm.makeDataFrame() + compression_options = {"method": "gzip", "mtime": 1} + + # test for filename + with tm.ensure_clean() as path: + path = Path(path) + df.to_csv(path, compression=compression_options) + time.sleep(2) + output = path.read_bytes() + df.to_csv(path, compression=compression_options) + assert output == path.read_bytes() + + +def test_gzip_reproducibility_file_object(): + """ + Gzip should create reproducible archives with mtime. 
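# --- editor's annotation: illustrative sketch, not part of this patch ---
# Why pinning mtime matters: the gzip header embeds a 4-byte modification
# timestamp, so two otherwise identical archives differ unless it is fixed.
# A stdlib-only demonstration (the payload is an assumption):
import gzip
import io

buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode="wb", mtime=1) as f:
    f.write(b"payload")
# repeated runs yield identical buf.getvalue() because MTIME is pinned
# --- end annotation ---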
+ + GH 28103 + """ + df = tm.makeDataFrame() + compression_options = {"method": "gzip", "mtime": 1} + + # test for file object + buffer = io.BytesIO() + df.to_csv(buffer, compression=compression_options, mode="wb") + output = buffer.getvalue() + time.sleep(2) + buffer = io.BytesIO() + df.to_csv(buffer, compression=compression_options, mode="wb") + assert output == buffer.getvalue() + + def test_with_missing_lzma(): """Tests if import pandas works when lzma is not present.""" # https://github.com/pandas-dev/pandas/issues/27575 From 6ea8474f502a14a97fd1407e9a073228abe36c7e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 14 Aug 2020 02:50:51 +0100 Subject: [PATCH 41/51] REGR: Dataframe.reset_index() on empty DataFrame with MI and datatime level (#35673) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/frame.py | 2 +- .../tests/frame/methods/test_reset_index.py | 30 +++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index b37103910afab..98d67e930ccc0 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) +- Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) - Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 547d86f221b5f..1587dd8798ec3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4816,7 +4816,7 @@ def _maybe_casted_values(index, labels=None): # we can have situations where the whole mask is -1, # meaning there is nothing found in labels, so make all nan's - if mask.all(): + if mask.size > 0 and mask.all(): dtype = index.dtype fill_value = na_value_for_dtype(dtype) values = construct_1d_arraylike_from_scalar( diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index da4bfa9be4881..b88ef0e6691cb 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -318,3 +318,33 @@ def test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype): result = DataFrame(index=idx)[:0].reset_index().dtypes expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) + + +def test_reset_index_empty_frame_with_datetime64_multiindex(): + # https://github.com/pandas-dev/pandas/issues/35606 + idx = MultiIndex( + levels=[[pd.Timestamp("2020-07-20 00:00:00")], [3, 4]], + codes=[[], []], + names=["a", "b"], + ) + df = DataFrame(index=idx, columns=["c", "d"]) + result = df.reset_index() + expected = DataFrame( + columns=list("abcd"), index=RangeIndex(start=0, stop=0, step=1) + ) + expected["a"] = 
expected["a"].astype("datetime64[ns]") + expected["b"] = expected["b"].astype("int64") + tm.assert_frame_equal(result, expected) + + +def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): + # https://github.com/pandas-dev/pandas/issues/35657 + df = DataFrame(dict(c1=[10.0], c2=["a"], c3=pd.to_datetime("2020-01-01"))) + df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum() + result = df.reset_index() + expected = DataFrame( + columns=["c2", "c3", "c1"], index=RangeIndex(start=0, stop=0, step=1) + ) + expected["c3"] = expected["c3"].astype("datetime64[ns]") + expected["c1"] = expected["c1"].astype("float64") + tm.assert_frame_equal(result, expected) From faa6e36e482ec5f00e6502c1e5d4fdf4f2f08009 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 14 Aug 2020 13:32:00 +0100 Subject: [PATCH 42/51] CLN: remove extant uses of built-in filter function (#35717) --- pandas/_config/localization.py | 14 ++++++++------ pandas/core/computation/expr.py | 7 +++---- pandas/core/reshape/merge.py | 7 +++++-- pandas/io/json/_json.py | 5 +++-- pandas/io/parsers.py | 4 +--- pandas/io/pytables.py | 20 +++++++++----------- pandas/tests/computation/test_eval.py | 26 +++++++++++++++----------- 7 files changed, 44 insertions(+), 39 deletions(-) diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index 66865e1afb952..3933c8f3d519c 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -88,12 +88,14 @@ def _valid_locales(locales, normalize): valid_locales : list A list of valid locales. """ - if normalize: - normalizer = lambda x: locale.normalize(x.strip()) - else: - normalizer = lambda x: x.strip() - - return list(filter(can_set_locale, map(normalizer, locales))) + return [ + loc + for loc in ( + locale.normalize(loc.strip()) if normalize else loc.strip() + for loc in locales + ) + if can_set_locale(loc) + ] def _default_locale_getter(): diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index fcccc24ed7615..125ecb0d88036 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -167,10 +167,9 @@ def _is_type(t): # partition all AST nodes _all_nodes = frozenset( - filter( - lambda x: isinstance(x, type) and issubclass(x, ast.AST), - (getattr(ast, node) for node in dir(ast)), - ) + node + for node in (getattr(ast, name) for name in dir(ast)) + if isinstance(node, type) and issubclass(node, ast.AST) ) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 2349cb1dcc0c7..01e20f49917ac 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2012,8 +2012,11 @@ def _sort_labels(uniques: np.ndarray, left, right): def _get_join_keys(llab, rlab, shape, sort: bool): # how many levels can be done without overflow - pred = lambda i: not is_int64_overflow_possible(shape[:i]) - nlev = next(filter(pred, range(len(shape), 0, -1))) + nlev = next( + lev + for lev in range(len(shape), 0, -1) + if not is_int64_overflow_possible(shape[:lev]) + ) # get keys for the first `nlev` levels stride = np.prod(shape[1:nlev], dtype="i8") diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index c2bd6302940bb..fe5e172655ae1 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -765,8 +765,9 @@ def _combine_lines(self, lines) -> str: """ Combines a list of JSON objects into one JSON object. 
""" - lines = filter(None, map(lambda x: x.strip(), lines)) - return "[" + ",".join(lines) + "]" + return ( + f'[{",".join((line for line in (line.strip() for line in lines) if line))}]' + ) def read(self): """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9dc0e1f71d13b..5d49757ce7d58 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2161,9 +2161,7 @@ def read(self, nrows=None): if self.usecols is not None: columns = self._filter_usecols(columns) - col_dict = dict( - filter(lambda item: item[0] in columns, col_dict.items()) - ) + col_dict = {k: v for k, v in col_dict.items() if k in columns} return index, columns, col_dict diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2abc570a04de3..f08e0514a68e1 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -99,22 +99,20 @@ def _ensure_str(name): def _ensure_term(where, scope_level: int): """ - ensure that the where is a Term or a list of Term - this makes sure that we are capturing the scope of variables - that are passed - create the terms here with a frame_level=2 (we are 2 levels down) + Ensure that the where is a Term or a list of Term. + + This makes sure that we are capturing the scope of variables that are + passed create the terms here with a frame_level=2 (we are 2 levels down) """ # only consider list/tuple here as an ndarray is automatically a coordinate # list level = scope_level + 1 if isinstance(where, (list, tuple)): - wlist = [] - for w in filter(lambda x: x is not None, where): - if not maybe_expression(w): - wlist.append(w) - else: - wlist.append(Term(w, scope_level=level)) - where = wlist + where = [ + Term(term, scope_level=level + 1) if maybe_expression(term) else term + for term in where + if term is not None + ] elif maybe_expression(where): where = Term(where, scope_level=level) return where if where is None or len(where) else None diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 08d8d5ca342b7..853ab00853d1b 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -168,7 +168,7 @@ def setup_ops(self): def setup_method(self, method): self.setup_ops() self.setup_data() - self.current_engines = filter(lambda x: x != self.engine, _engines) + self.current_engines = (engine for engine in _engines if engine != self.engine) def teardown_method(self, method): del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses @@ -774,11 +774,9 @@ def setup_class(cls): cls.parser = "python" def setup_ops(self): - self.cmp_ops = list( - filter(lambda x: x not in ("in", "not in"), expr._cmp_ops_syms) - ) + self.cmp_ops = [op for op in expr._cmp_ops_syms if op not in ("in", "not in")] self.cmp2_ops = self.cmp_ops[::-1] - self.bin_ops = [s for s in expr._bool_ops_syms if s not in ("and", "or")] + self.bin_ops = [op for op in expr._bool_ops_syms if op not in ("and", "or")] self.special_case_ops = _special_case_arith_ops_syms self.arith_ops = _good_arith_ops self.unary_ops = "+", "-", "~" @@ -1150,9 +1148,9 @@ def eval(self, *args, **kwargs): return pd.eval(*args, **kwargs) def test_simple_arith_ops(self): - ops = self.arith_ops + ops = (op for op in self.arith_ops if op != "//") - for op in filter(lambda x: x != "//", ops): + for op in ops: ex = f"1 {op} 1" ex2 = f"x {op} 1" ex3 = f"1 {op} (x + 1)" @@ -1637,8 +1635,11 @@ def setup_class(cls): super().setup_class() cls.engine = "numexpr" cls.parser = "python" - cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms - cls.arith_ops = 
filter(lambda x: x not in ("in", "not in"), cls.arith_ops) + cls.arith_ops = [ + op + for op in expr._arith_ops_syms + expr._cmp_ops_syms + if op not in ("in", "not in") + ] def test_check_many_exprs(self): a = 1 # noqa @@ -1726,8 +1727,11 @@ class TestOperationsPythonPython(TestOperationsNumExprPython): def setup_class(cls): super().setup_class() cls.engine = cls.parser = "python" - cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms - cls.arith_ops = filter(lambda x: x not in ("in", "not in"), cls.arith_ops) + cls.arith_ops = [ + op + for op in expr._arith_ops_syms + expr._cmp_ops_syms + if op not in ("in", "not in") + ] class TestOperationsPythonPandas(TestOperationsNumExprPandas): From ba3400d591fd00c41ff7e7f0f91032e72498316c Mon Sep 17 00:00:00 2001 From: attack68 <24256554+attack68@users.noreply.github.com> Date: Fri, 14 Aug 2020 14:36:09 +0200 Subject: [PATCH 43/51] BUG: Styler cell_ids fails on multiple renders (#35664) --- doc/source/whatsnew/v1.1.1.rst | 2 +- pandas/io/formats/style.py | 14 +++++++------- pandas/tests/io/formats/test_style.py | 5 ++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 98d67e930ccc0..3f177b29d52b8 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -33,7 +33,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`). +- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`). Categorical ^^^^^^^^^^^ diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 584f42a6cab12..3bbb5271bce61 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -390,16 +390,16 @@ def format_attr(pair): "is_visible": (c not in hidden_columns), } # only add an id if the cell has a style + props = [] if self.cell_ids or (r, c) in ctx: row_dict["id"] = "_".join(cs[1:]) + for x in ctx[r, c]: + # have to handle empty styles like [''] + if x.count(":"): + props.append(tuple(x.split(":"))) + else: + props.append(("", "")) row_es.append(row_dict) - props = [] - for x in ctx[r, c]: - # have to handle empty styles like [''] - if x.count(":"): - props.append(tuple(x.split(":"))) - else: - props.append(("", "")) cellstyle_map[tuple(props)].append(f"row{r}_col{c}") body.append(row_es) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 3ef5157655e78..6025649e9dbec 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1684,8 +1684,11 @@ def f(a, b, styler): def test_no_cell_ids(self): # GH 35588 + # GH 35663 df = pd.DataFrame(data=[[0]]) - s = Styler(df, uuid="_", cell_ids=False).render() + styler = Styler(df, uuid="_", cell_ids=False) + styler.render() + s = styler.render() # render twice to ensure ctx is not updated assert s.find('') != -1 From 21eb4e6febd6ce0408ae2ac5e8c02056d4fb9f93 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 14 Aug 2020 16:35:21 +0200 Subject: [PATCH 44/51] REGR: fix DataFrame.diff with read-only data (#35707) Co-authored-by: Jeff Reback --- doc/source/whatsnew/v1.1.1.rst | 3 ++- pandas/_libs/algos.pyx | 7 ++++--- pandas/tests/frame/methods/test_diff.py | 9 +++++++++ setup.py | 3 +++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 3f177b29d52b8..85e2a335c55c6 
100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -16,10 +16,11 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) -- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). +- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`) - Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) +- Fixed regression in :meth:`DataFrame.diff` with read-only data (:issue:`35559`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 7e90a8cc681ef..0a70afda893cf 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1200,14 +1200,15 @@ ctypedef fused out_t: @cython.boundscheck(False) @cython.wraparound(False) def diff_2d( - diff_t[:, :] arr, - out_t[:, :] out, + ndarray[diff_t, ndim=2] arr, # TODO(cython 3) update to "const diff_t[:, :] arr" + ndarray[out_t, ndim=2] out, Py_ssize_t periods, int axis, ): cdef: Py_ssize_t i, j, sx, sy, start, stop - bint f_contig = arr.is_f_contig() + bint f_contig = arr.flags.f_contiguous + # bint f_contig = arr.is_f_contig() # TODO(cython 3) # Disable for unsupported dtype combinations, # see https://github.com/cython/cython/issues/2646 diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index 45f134a93a23a..0486fb2d588b6 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -214,3 +214,12 @@ def test_diff_integer_na(self, axis, expected): # Test case for default behaviour of diff result = df.diff(axis=axis) tm.assert_frame_equal(result, expected) + + def test_diff_readonly(self): + # https://github.com/pandas-dev/pandas/issues/35559 + arr = np.random.randn(5, 2) + arr.flags.writeable = False + df = pd.DataFrame(arr) + result = df.diff() + expected = pd.DataFrame(np.array(df)).diff() + tm.assert_frame_equal(result, expected) diff --git a/setup.py b/setup.py index 43d19d525876b..f6f0cd9aabc0e 100755 --- a/setup.py +++ b/setup.py @@ -456,6 +456,9 @@ def run(self): if sys.version_info[:2] == (3, 8): # GH 33239 extra_compile_args.append("-Wno-error=deprecated-declarations") + # https://github.com/pandas-dev/pandas/issues/35559 + extra_compile_args.append("-Wno-error=unreachable-code") + # enable coverage by building cython files by setting the environment variable # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext # with `--with-cython-coverage`enabled From 
39894938dc6d20487f59c3706e0b46b0fb081296 Mon Sep 17 00:00:00 2001 From: Jiaxiang Date: Sat, 15 Aug 2020 00:17:37 +0800 Subject: [PATCH 45/51] [BUG] fixed DateOffset pickle bug when months >= 12 (#35258) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 7 - .../1.1.0/1.1.0_x86_64_darwin_3.8.5.pickle | Bin 0 -> 127216 bytes .../tests/io/generate_legacy_storage_files.py | 8 +- .../offsets/data/dateoffset_0_15_2.pickle | 183 ------------------ pandas/tests/tseries/offsets/test_offsets.py | 25 +-- 6 files changed, 14 insertions(+), 210 deletions(-) create mode 100644 pandas/tests/io/data/legacy_pickle/1.1.0/1.1.0_x86_64_darwin_3.8.5.pickle delete mode 100644 pandas/tests/tseries/offsets/data/dateoffset_0_15_2.pickle diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6612f741d925d..a3bb6dfd86bd2 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -172,6 +172,7 @@ Datetimelike ^^^^^^^^^^^^ - Bug in :attr:`DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) - Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`) +- Bug in :class:`DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) - Timedelta diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index ac2725fc58aee..7f0314d737619 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -989,13 +989,6 @@ cdef class RelativeDeltaOffset(BaseOffset): state["_offset"] = state.pop("offset") state["kwds"]["offset"] = state["_offset"] - if "_offset" in state and not isinstance(state["_offset"], timedelta): - # relativedelta, we need to populate using its kwds - offset = state["_offset"] - odict = offset.__dict__ - kwds = {key: odict[key] for key in odict if odict[key]} - state.update(kwds) - self.n = state.pop("n") self.normalize = state.pop("normalize") self._cache = state.pop("_cache", {}) diff --git a/pandas/tests/io/data/legacy_pickle/1.1.0/1.1.0_x86_64_darwin_3.8.5.pickle b/pandas/tests/io/data/legacy_pickle/1.1.0/1.1.0_x86_64_darwin_3.8.5.pickle new file mode 100644 index 0000000000000000000000000000000000000000..f8df9afff65658d0f58a5b2afb30997cea53e4c8 GIT binary patch literal 127216 zcmdqq1z6n7qp0y@0u-oHcNc2Z-AkQ9-G!oEwpfw6JMGq~ySux)ySux)+nwybTd4Q@ z&i|Zy&z(NqAK7F>GM>qV?O`+On2%x(U89{%cHw?u0e<1p`p>rhfgxTI(Wb;V{aS~9N`xh>=hX9666)^)!HvCT<)!2 zKyd3ozlwn&K5g7B??#(<;@l$Il=9n8`}%e83wQDM3ipdPIeG^BwfC$ZEZ>vcn^OI3 z%P_Ct*7~M(b*-D~r(zTA733G~VX_U6@Cu8Fc2i6?;SnLB(QdZ-4}S6oHL_oH-Dod) z49S8cgF-u6&Ne7AFe1P!EX=E8v?-ovn4eEbaCk&mq)&u?2)5=8qfK#S?3gzhDR1YoRj~ZWOFu%Ad5z*dJ zuFjU^u)aPm$4})g+T+{q(Q-K_GD4#b2~2k0di;k+w=lnL*Ul?2QeTnXWG_?EQ?8RKW$zssAWuJ7{%9K>7~rFy zft%uH>!#`_5a|^eE`Ptt##5eLv_UmFcn0}Jc=>uoc*(sQl6#qKy&@uHig%R@9+BQA zP3DGZgPkS27#s}oV@GLDlL&|vk%qj#jYgIDv-17&Uqz## zs2;_|4eo{#<~TEyHqilM6Y#BGBDI{5ki=T5y=tW!6sUB1gh z<@7w%BHFpC^>p3TuZ7YxFu*(9B|;{qPC$^KEDk}TGJPHDTNh*+542a5A%_o+@8jg* zp~t#O^A8D$u1dC3{Qf4_S)(g3Cc47^GP*>B%!of^6H~gM6?mE3{LJSe>lJhTP>DCJ zh2JpJ6#x78%oTz;pPJIf-l@3|hr7hu=hu%`pnOx;*?O8k{$u8JlsQ#qPWPA;viTu% z%3tp)WTBGPgZvi}-A~VdQLeQWW%u3lcBc|6O0rHN4Rn3u+RW5lF>&m3UEz|q!f9!p zE!&3bPOaVWvhI^>_w$^QmJRRTS$D8a(*0)=>$^4Cdisa?wT(8V;Y7d50U`eW;eIl< zOKay0)CS34NgCA>dY;ZZ0g+g7Rs92^_*=N=-19KP_NmY zep&8pr86o_d|4IO^g z2FM(ys;YVKKii2>ktY@9YAaENzLlTJAv`23A|%Y$FHG+)=-md(F+4E5^80np*AFPq 
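
The gist of the fix above: dateutil's ``relativedelta`` normalizes out-of-range
inputs (``relativedelta(months=12)`` becomes ``relativedelta(years=+1)``), so the
deleted ``__setstate__`` branch repopulated the unpickled offset with normalized
attributes that no longer matched the original object. A minimal sketch of the
round trip, assuming a build with this patch applied:

    import pickle

    from pandas.tseries.offsets import DateOffset

    offset = DateOffset(months=12)
    restored = pickle.loads(pickle.dumps(offset))

    # With the patch, the stored kwds survive unpickling unchanged.
    assert restored == offset
    assert restored.months == 12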
z-h6L?x6oVUE%ugpOTA^@a&LvV(p%-N_SSf7y>;GtZ-ckd+vIKbws>2;ZQgcohqu$) zs4q?}B&HyX0N=u6S3y zYuPrYZ}bMJ-s(tG8-_TG4Jy?5Sw?}PWz z`{aH0zIb1~Z{BzBhxgO_<^A>o{8WBwKaC&gr}cyUbbfk2gP+k4_JL3N&__P)W1sO^ zpYwU2_<}F`k}vy;ulky=`-X4&mT&ux@A{ta`yqZNKeM03&+2FMv->&xoPI7px1Yz) z>*w?H`vv@hej&fGU&Jr!7xRnzCH#_pDZjK|#xLua^UM1c{EB`hzp`J&5B011)%@yy z4Zo&e%dhR%@$35a{Q7!{GNUJRgW`y>33exg6hAMKCv$NJ;^@%{uq$)D&?@+bRK{Hgvlf4V=zpXtx?XZv&fx&AzV zzQ4d<=r8gY`%C<#{xW~LzrtVXuku&>Yy7qTI)A;t!Qbd_@;Cci{H^{rf4jfK-|6r2 zcl&$%z5YIbzkk3#=pXVA`$znv{xSc!f5JcMpYl)pXZ*ANIsd$W!N2HV@-O>W{Hy*o z|GIy}zv8I=$p)jmh(i1y*}QC%Wh zN3@R*PfDczd;OoSX}d>+caBR6O_dnbAvPvHGFc5s3~U|V{g0BqeMET2e->#YV-r%K z0g354M8zb;NBq^}B3i}9wEm-m4pFT-$NtL@n8Kb^Ju&Sc?qnrRGFh?+Orc7aL4S#o zsz*Ob9S~4GG5B8u$?kMrBO==U-H{tvoz$@CkmRZ) zr;;irC7}#|uSryoh@|9Ego)^X3m6s|7892gttX@ZRxsL1R-@hjUH>5ph|ZM47Z9EK zugLOOWc@3${T12&iX4AM&SX(NI@h0P?my2wDUP7%ynhydvW(96N6-J~Ss=xoDtS_3 z!DMAv^j~yha7=j2f9hCm4~Z`H2Nqp8S)}9=TqYqdDkdT>u3U0Cqwgg5Me@~(zMmok zLNkO0q+HNH{Q&*}R!S*&Z#K1qrosd*ED8rwDj!7v^kvyld za&pYFK?tT8|FQi~q~vHRQBqnW5tmJfi;wN_PrQ_aC5QX}SW{xA8lIT0T5OD2p>tGH z=IZ|({NHr2PDF>OWJr8unTWPgF)3XCw`oH1OCBDY;cuS*ftQbI{U^`=GW>;#f1nxv zoAuwzKd_W8PE7v?l*0Az#!UpZ62d#jM|A$rnf}}GH%ywpVg3jDhvRGVS0v@Ie_d(* z6z88WMq-9W$xZSf#(%B;(xv)Cmo5eLKe_!!{|f+r0D%pXdnG0Fzseui-;Ys-6i-;) zgyepW9-4BGQV$RPOB$TwsnO<7t|?ppYyI~B*#2FBpbCxh z=gFTph4&vdg)%vKP-wcu!1l>>e{W7u`D8WX&+Yt2{loL;MFjqHasMj+>j Date: Fri, 14 Aug 2020 12:50:20 -0700 Subject: [PATCH 46/51] CLN: remove unused variable (#35726) --- pandas/core/window/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 0306d4de2fc73..966773b7c6982 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -318,7 +318,7 @@ def __repr__(self) -> str: def __iter__(self): window = self._get_window(win_type=None) - blocks, obj = self._create_blocks(self._selected_obj) + _, obj = self._create_blocks(self._selected_obj) index = self._get_window_indexer(window=window) start, end = index.get_window_bounds( From 0658ce38754ffb3ef44121c2fe7a810a809b268c Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 14 Aug 2020 16:59:09 -0400 Subject: [PATCH 47/51] agg with list of non-aggregating functions (#35723) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/groupby/generic.py | 25 +++++++++++-------- pandas/core/groupby/groupby.py | 10 +++++--- .../tests/groupby/aggregate/test_aggregate.py | 13 ++++++++++ 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 85e2a335c55c6..565b4a014bd0c 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -26,6 +26,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) - Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) +- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate. (:issue:`35490`) .. 
From 0658ce38754ffb3ef44121c2fe7a810a809b268c Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Fri, 14 Aug 2020 16:59:09 -0400
Subject: [PATCH 47/51] agg with list of non-aggregating functions (#35723)

---
 doc/source/whatsnew/v1.1.1.rst                |  1 +
 pandas/core/groupby/generic.py                | 25 +++++++++++--------
 pandas/core/groupby/groupby.py                | 10 +++++---
 .../tests/groupby/aggregate/test_aggregate.py | 13 ++++++++++
 4 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst
index 85e2a335c55c6..565b4a014bd0c 100644
--- a/doc/source/whatsnew/v1.1.1.rst
+++ b/doc/source/whatsnew/v1.1.1.rst
@@ -26,6 +26,7 @@ Fixed regressions
 - Fixed regression where :meth:`DataFrame.reset_index` would raise a ``ValueError`` on an empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`)
 - Fixed regression where :meth:`DataFrame.merge_asof` would raise an ``UnboundLocalError`` when ``left_index``, ``right_index`` and ``tolerance`` were set (:issue:`35558`)
 - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`)
+- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`)

 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index b7280a9f7db3c..b806d9856d20f 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -322,11 +322,14 @@ def _aggregate_multiple_funcs(self, arg):
             # let higher level handle
             return results

-        output = self._wrap_aggregated_output(results)
+        output = self._wrap_aggregated_output(results, index=None)
         return self.obj._constructor_expanddim(output, columns=columns)

+    # TODO: index should not be Optional - see GH 35490
     def _wrap_series_output(
-        self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index,
+        self,
+        output: Mapping[base.OutputKey, Union[Series, np.ndarray]],
+        index: Optional[Index],
     ) -> Union[Series, DataFrame]:
         """
         Wraps the output of a SeriesGroupBy operation into the expected result.
@@ -335,7 +338,7 @@ def _wrap_series_output(
         ----------
         output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
             Data to wrap.
-        index : pd.Index
+        index : pd.Index or None
             Index to apply to the output.

         Returns
@@ -363,8 +366,11 @@ def _wrap_series_output(

         return result

+    # TODO: Remove index argument, use self.grouper.result_index, see GH 35490
     def _wrap_aggregated_output(
-        self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
+        self,
+        output: Mapping[base.OutputKey, Union[Series, np.ndarray]],
+        index: Optional[Index],
     ) -> Union[Series, DataFrame]:
         """
         Wraps the output of a SeriesGroupBy aggregation into the expected result.
@@ -383,9 +389,7 @@ def _wrap_aggregated_output(
         In the vast majority of cases output will only contain one element.
         The exception is operations that expand dimensions, like ohlc.
         """
-        result = self._wrap_series_output(
-            output=output, index=self.grouper.result_index
-        )
+        result = self._wrap_series_output(output=output, index=index)
         return self._reindex_output(result)

     def _wrap_transformed_output(
@@ -1720,7 +1724,9 @@ def _insert_inaxis_grouper_inplace(self, result):
             result.insert(0, name, lev)

     def _wrap_aggregated_output(
-        self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
+        self,
+        output: Mapping[base.OutputKey, Union[Series, np.ndarray]],
+        index: Optional[Index],
     ) -> DataFrame:
         """
         Wraps the output of DataFrameGroupBy aggregations into the expected result.
@@ -1745,8 +1751,7 @@ def _wrap_aggregated_output(
             self._insert_inaxis_grouper_inplace(result)
             result = result._consolidate()
         else:
-            index = self.grouper.result_index
-            result.index = index
+            result.index = self.grouper.result_index

         if self.axis == 1:
             result = result.T
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 4597afeeaddbf..0047877ef78ee 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -974,7 +974,9 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs):

         return self._wrap_transformed_output(output)

-    def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]):
+    def _wrap_aggregated_output(
+        self, output: Mapping[base.OutputKey, np.ndarray], index: Optional[Index]
+    ):
         raise AbstractMethodError(self)

     def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]):
@@ -1049,7 +1051,7 @@ def _cython_agg_general(
         if len(output) == 0:
             raise DataError("No numeric types to aggregate")

-        return self._wrap_aggregated_output(output)
+        return self._wrap_aggregated_output(output, index=self.grouper.result_index)

     def _python_agg_general(
         self, func, *args, engine="cython", engine_kwargs=None, **kwargs
@@ -1102,7 +1104,7 @@ def _python_agg_general(

             output[key] = maybe_cast_result(values[mask], result)

-        return self._wrap_aggregated_output(output)
+        return self._wrap_aggregated_output(output, index=self.grouper.result_index)

     def _concat_objects(self, keys, values, not_indexed_same: bool = False):
         from pandas.core.reshape.concat import concat
@@ -2534,7 +2536,7 @@ def _get_cythonized_result(
             raise TypeError(error_msg)

         if aggregate:
-            return self._wrap_aggregated_output(output)
+            return self._wrap_aggregated_output(output, index=self.grouper.result_index)
         else:
             return self._wrap_transformed_output(output)

diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 40a20c8210052..ce9d4b892d775 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -1061,3 +1061,16 @@ def test_groupby_get_by_index():
     res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])})
     expected = pd.DataFrame(dict(A=["S", "W"], B=[1.0, 2.0])).set_index("A")
     pd.testing.assert_frame_equal(res, expected)
+
+
+def test_nonagg_agg():
+    # GH 35490 - Single/Multiple agg of non-agg function give same results
+    # TODO: agg should raise for functions that don't aggregate
+    df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]})
+    g = df.groupby("a")
+
+    result = g.agg(["cumsum"])
+    result.columns = result.columns.droplevel(-1)
+    expected = g.agg("cumsum")
+
+    tm.assert_frame_equal(result, expected)
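
A minimal sketch of the regression, mirroring the new ``test_nonagg_agg`` above: a
non-aggregating function wrapped in a list should produce the same values as the
same function passed alone, apart from the extra column level that list
aggregation adds:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]})
    grouped = df.groupby("a")

    listed = grouped.agg(["cumsum"])   # function wrapped in a list
    listed.columns = listed.columns.droplevel(-1)
    single = grouped.agg("cumsum")     # same function on its own

    # Before this patch the listed form was wrongly reindexed with the group
    # labels; with it, both spellings agree.
    pd.testing.assert_frame_equal(listed, single)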
From 6325a331b2c3a06609b7689c55fcc186a3cace7f Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 14 Aug 2020 14:01:02 -0700
Subject: [PATCH 48/51] BLD: bump xlrd min version to 1.2.0 (#35728)

---
 ci/deps/azure-37-locale_slow.yaml      |  2 +-
 ci/deps/azure-37-minimum_versions.yaml |  2 +-
 doc/source/getting_started/install.rst |  2 +-
 doc/source/whatsnew/v1.2.0.rst         |  2 +-
 pandas/compat/_optional.py             |  2 +-
 pandas/tests/io/excel/test_readers.py  | 43 +++++++-------------------
 6 files changed, 16 insertions(+), 37 deletions(-)

diff --git a/ci/deps/azure-37-locale_slow.yaml b/ci/deps/azure-37-locale_slow.yaml
index 3ccb66e09fe7e..8000f3e6b9a9c 100644
--- a/ci/deps/azure-37-locale_slow.yaml
+++ b/ci/deps/azure-37-locale_slow.yaml
@@ -24,7 +24,7 @@ dependencies:
   - pytz=2017.3
   - scipy
   - sqlalchemy=1.2.8
-  - xlrd=1.1.0
+  - xlrd=1.2.0
   - xlsxwriter=1.0.2
   - xlwt=1.3.0
   - html5lib=1.0.1
diff --git a/ci/deps/azure-37-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml
index 94cc5812bcc10..05b1957198bc4 100644
--- a/ci/deps/azure-37-minimum_versions.yaml
+++ b/ci/deps/azure-37-minimum_versions.yaml
@@ -25,7 +25,7 @@ dependencies:
   - pytz=2017.3
   - pyarrow=0.15
   - scipy=1.2
-  - xlrd=1.1.0
+  - xlrd=1.2.0
   - xlsxwriter=1.0.2
   - xlwt=1.3.0
   - html5lib=1.0.1
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index 7ab150394bf51..4c270117e079e 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -287,7 +287,7 @@ s3fs 0.4.0 Amazon S3 access
 tabulate       0.8.3     Printing in Markdown-friendly format (see `tabulate`_)
 xarray         0.12.0    pandas-like API for N-dimensional data
 xclip                    Clipboard I/O on linux
-xlrd           1.1.0     Excel reading
+xlrd           1.2.0     Excel reading
 xlwt           1.3.0     Excel writing
 xsel                     Clipboard I/O on linux
 zlib                     Compression for HDF5
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index a3bb6dfd86bd2..42f95d88d74ac 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -122,7 +122,7 @@ Optional libraries below the lowest tested version may still work, but are not c
 +-----------------+-----------------+---------+
 | xarray          | 0.12.0          | X       |
 +-----------------+-----------------+---------+
-| xlrd            | 1.1.0           |         |
+| xlrd            | 1.2.0           | X       |
 +-----------------+-----------------+---------+
 | xlsxwriter      | 1.0.2           | X       |
 +-----------------+-----------------+---------+
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 6423064732def..81eac490fe5b9 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -27,7 +27,7 @@
     "tables": "3.4.3",
     "tabulate": "0.8.3",
     "xarray": "0.8.2",
-    "xlrd": "1.1.0",
+    "xlrd": "1.2.0",
     "xlwt": "1.2.0",
     "xlsxwriter": "0.9.8",
     "numba": "0.46.0",
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index b610c5ec3a838..51fbbf836a03f 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -1,9 +1,7 @@
-import contextlib
 from datetime import datetime, time
 from functools import partial
 import os
 from urllib.error import URLError
-import warnings

 import numpy as np
 import pytest
@@ -14,22 +12,6 @@
 from pandas import DataFrame, Index, MultiIndex, Series
 import pandas._testing as tm

-
-@contextlib.contextmanager
-def ignore_xlrd_time_clock_warning():
-    """
-    Context manager to ignore warnings raised by the xlrd library,
-    regarding the deprecation of `time.clock` in Python 3.7.
-    """
-    with warnings.catch_warnings():
-        warnings.filterwarnings(
-            action="ignore",
-            message="time.clock has been deprecated",
-            category=DeprecationWarning,
-        )
-        yield
-
-
 read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
 engine_params = [
     # Add any engines to test here
@@ -134,21 +116,19 @@ def test_usecols_int(self, read_ext, df_ref):
         # usecols as int
         msg = "Passing an integer for `usecols`"
         with pytest.raises(ValueError, match=msg):
-            with ignore_xlrd_time_clock_warning():
-                pd.read_excel(
-                    "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3
-                )
+            pd.read_excel(
+                "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3
+            )

         # usecols as int
         with pytest.raises(ValueError, match=msg):
-            with ignore_xlrd_time_clock_warning():
-                pd.read_excel(
-                    "test1" + read_ext,
-                    sheet_name="Sheet2",
-                    skiprows=[1],
-                    index_col=0,
-                    usecols=3,
-                )
+            pd.read_excel(
+                "test1" + read_ext,
+                sheet_name="Sheet2",
+                skiprows=[1],
+                index_col=0,
+                usecols=3,
+            )

     def test_usecols_list(self, read_ext, df_ref):
         if pd.read_excel.keywords["engine"] == "pyxlsb":
@@ -597,8 +577,7 @@ def test_sheet_name(self, read_ext, df_ref):
         df1 = pd.read_excel(
             filename + read_ext, sheet_name=sheet_name, index_col=0
         )  # doc
-        with ignore_xlrd_time_clock_warning():
-            df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name)
+        df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name)

         tm.assert_frame_equal(df1, df_ref, check_names=False)
         tm.assert_frame_equal(df2, df_ref, check_names=False)
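
The bumped floor in ``pandas/compat/_optional.py`` is enforced whenever pandas
imports xlrd on demand, e.g. from ``read_excel``. A small sketch of that check
(``import_optional_dependency`` raises an ``ImportError`` when the installed
version is below the listed minimum; the exact message text is generated from
the versions table above):

    from pandas.compat._optional import import_optional_dependency

    # With this patch, an environment with xlrd < 1.2.0 is rejected here;
    # otherwise the module object is returned.
    xlrd = import_optional_dependency("xlrd")
    print(xlrd.__version__)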
From a3f5c6a5a3f05edc1c56ab8051f28c4cf322c45c Mon Sep 17 00:00:00 2001
From: estasney
Date: Fri, 14 Aug 2020 17:01:49 -0400
Subject: [PATCH 49/51] Fix broken link in cookbook.rst (#35729)

---
 doc/source/user_guide/cookbook.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
index 49487ac327e73..7542e1dc7df6f 100644
--- a/doc/source/user_guide/cookbook.rst
+++ b/doc/source/user_guide/cookbook.rst
@@ -765,7 +765,7 @@ Timeseries
 `__

 `Aggregation and plotting time series
-`__
+`__

 Turn a matrix with hours in columns and days in rows into a continuous row sequence in the form of a time series.
 `How to rearrange a Python pandas DataFrame?
From ed116557544a622ef295db45a5a00d2fa9f2cbc7 Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Mon, 17 Aug 2020 09:50:00 +0100 Subject: [PATCH 50/51] CI: Min Pytest Cov Version/Restrict xdist version (#35754) --- ci/deps/azure-windows-37.yaml | 2 +- ci/deps/azure-windows-38.yaml | 2 +- ci/deps/travis-37-cov.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 4d745454afcab..f4c238ab8b173 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -8,7 +8,7 @@ dependencies: # tools - cython>=0.29.16 - pytest>=5.0.1 - - pytest-xdist>=1.21 + - pytest-xdist>=1.21,<2.0.0 # GH 35737 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index f428a6dadfaa2..1f383164b5328 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -8,7 +8,7 @@ dependencies: # tools - cython>=0.29.16 - pytest>=5.0.1 - - pytest-xdist>=1.21 + - pytest-xdist>=1.21,<2.0.0 # GH 35737 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/travis-37-cov.yaml b/ci/deps/travis-37-cov.yaml index 3a0827a16f97a..edc11bdf4ab35 100644 --- a/ci/deps/travis-37-cov.yaml +++ b/ci/deps/travis-37-cov.yaml @@ -10,7 +10,7 @@ dependencies: - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - - pytest-cov # this is only needed in the coverage build + - pytest-cov>=2.10.1 # this is only needed in the coverage build, ref: GH 35737 # pandas dependencies - beautifulsoup4 From 0abfc7e24702266820ae1e7888a8374151ebef04 Mon Sep 17 00:00:00 2001 From: sanderland <48946947+sanderland@users.noreply.github.com> Date: Mon, 17 Aug 2020 12:22:34 +0200 Subject: [PATCH 51/51] REGR: Fix interpolation on empty dataframe (#35543) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/generic.py | 3 +++ pandas/tests/frame/methods/test_interpolate.py | 7 +++++++ 3 files changed, 11 insertions(+) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 565b4a014bd0c..b1fd76157b9f1 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`) - Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) +- Fixed regression where :meth:`DataFrame.interpolate` would raise a ``TypeError`` when the :class:`DataFrame` was empty (:issue:`35598`). 
- Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in :meth:`DataFrame.diff` with read-only data (:issue:`35559`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2219d54477d9e..9cbe2f714fd57 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6892,6 +6892,9 @@ def interpolate( obj = self.T if should_transpose else self + if obj.empty: + return self + if method not in fillna_methods: axis = self._info_axis_number diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index ddb5723e7bd3e..3c9d79397e4bd 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -34,6 +34,13 @@ def test_interp_basic(self): expected.loc[5, "B"] = 9 tm.assert_frame_equal(result, expected) + def test_interp_empty(self): + # https://github.com/pandas-dev/pandas/issues/35598 + df = DataFrame() + result = df.interpolate() + expected = df + tm.assert_frame_equal(result, expected) + def test_interp_bad_method(self): df = DataFrame( {