From 385cff4b5ea497c9f767ab13a430a30fb4175fa9 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Nov 2021 08:01:15 -0700 Subject: [PATCH 01/22] First pass at multi-column quantiles --- pandas/core/frame.py | 114 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2b0f7a36b6fa2..bf257c46dbacd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10368,7 +10368,7 @@ def quantile( interpolation: str = "linear", ): """ - Return values at the given quantile over requested axis. + Return values at the given quantile over requested axis, per-column. Parameters ---------- @@ -10460,6 +10460,118 @@ def quantile( result = self._constructor(res) return result + def quantiles( + self, + q=0.5, + axis: Axis = 0, + numeric_only: bool = True, + interpolation: str = "nearest", + ): + """ + Return values at the given quantile over requested axis for all columns. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + Value between 0 <= q <= 1, the quantile(s) to compute. + axis : {0, 1, 'index', 'columns'}, default 0 + Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + numeric_only : bool, default True + If False, the quantile of datetime and timedelta data will be + computed as well. + interpolation : {'lower', 'higher', 'nearest'}, default 'nearest' + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + + Returns + ------- + Series or DataFrame + + If ``q`` is an array, a DataFrame will be returned where the + index is ``q``, the columns are the columns of self, and the + values are the quantiles. 
+ If ``q`` is a float, a Series will be returned where the + index is the columns of self and the values are the quantiles. + + See Also + -------- + core.window.Rolling.quantile: Rolling quantile. + numpy.percentile: Numpy function to compute the percentile. + + Examples + -------- + >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... columns=['a', 'b']) + >>> df.quantile(.1) + a 1.3 + b 3.7 + Name: 0.1, dtype: float64 + >>> df.quantile([.1, .5]) + a b + 0.1 1.3 3.7 + 0.5 2.5 55.0 + + Specifying `numeric_only=False` will also compute the quantile of + datetime and timedelta data. + + >>> df = pd.DataFrame({'A': [1, 2], + ... 'B': [pd.Timestamp('2010'), + ... pd.Timestamp('2011')], + ... 'C': [pd.Timedelta('1 days'), + ... pd.Timedelta('2 days')]}) + >>> df.quantile(0.5, numeric_only=False) + A 1.5 + B 2010-07-02 12:00:00 + C 1 days 12:00:00 + Name: 0.5, dtype: object + """ + validate_percentile(q) + + return_series = False + if not is_list_like(q): + return_series = True + q = [q] + + q = Index(q, dtype=np.float64) + data = self._get_numeric_data() if numeric_only else self + axis = self._get_axis_number(axis) + + if axis == 1: + data = data.T + + if len(data.columns) == 0: + # GH#23925 _get_numeric_data may have dropped all columns + cols = Index([], name=self.columns.name) + if is_list_like(q): + return self._constructor([], index=q, columns=cols) + return self._constructor_sliced([], index=cols, name=q, dtype=np.float64) + + q_idx = np.quantile(np.arange(len(data)), q, interpolation=interpolation) + + by = data.columns.tolist() + if len(by) > 1: + keys = [data._get_label_or_level_values(x) for x in by] + indexer = lexsort_indexer(keys) + else: + by = by[0] + k = data._get_label_or_level_values(by) + indexer = nargsort(k) + + res = data._mgr.take(indexer[q_idx], verify=False) + + result = self._constructor(res) + if return_series: + result = result.T.iloc[:, 0] + result.name = q[0] + else: + result.index = q + + return result + 
@doc(NDFrame.asfreq, **_shared_doc_kwargs) def asfreq( self, From 2c55c68d984e9334e1f7b3756bbfb05ac86f8120 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 3 Nov 2021 09:13:24 -0700 Subject: [PATCH 02/22] Update docstring --- pandas/core/frame.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bf257c46dbacd..cb8d1d06a2220 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10477,8 +10477,8 @@ def quantiles( axis : {0, 1, 'index', 'columns'}, default 0 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. numeric_only : bool, default True - If False, the quantile of datetime and timedelta data will be - computed as well. + If False, datetime and timedelta data will be included in the + quantile computation. interpolation : {'lower', 'higher', 'nearest'}, default 'nearest' This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points `i` and `j`: @@ -10504,16 +10504,16 @@ def quantiles( Examples -------- - >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + >>> df = pd.DataFrame(np.array([[1, 10], [1, 2], [2, 100], [2, 50]]), ... columns=['a', 'b']) - >>> df.quantile(.1) - a 1.3 - b 3.7 - Name: 0.1, dtype: float64 - >>> df.quantile([.1, .5]) - a b - 0.1 1.3 3.7 - 0.5 2.5 55.0 + >>> df.quantiles(.1) + a 1 + b 2 + Name: 0.1, dtype: int64 + >>> df.quantiles([.1, .5]) + a b + 0.1 1 2 + 0.5 2 50 Specifying `numeric_only=False` will also compute the quantile of datetime and timedelta data. @@ -10523,10 +10523,10 @@ def quantiles( ... pd.Timestamp('2011')], ... 'C': [pd.Timedelta('1 days'), ... 
pd.Timedelta('2 days')]}) - >>> df.quantile(0.5, numeric_only=False) - A 1.5 - B 2010-07-02 12:00:00 - C 1 days 12:00:00 + >>> df.quantiles(0.5, numeric_only=False) + A 1 + B 2010-01-01 00:00:00 + C 1 days 00:00:00 Name: 0.5, dtype: object """ validate_percentile(q) From 4aae08c0a8f4ccefbcde2e95a284be5d5e210c5d Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 16 Nov 2021 07:11:04 -0800 Subject: [PATCH 03/22] Add tests for table quantiles --- pandas/tests/frame/methods/test_quantile.py | 559 +++++++++++++++++++- 1 file changed, 555 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index f341014110e18..bb088715efd76 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -232,6 +232,7 @@ def test_quantile_multi(self): expected = DataFrame( [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], index=[0.25, 0.5], columns=[0, 1, 2] ) + tm.assert_frame_equal(result, expected) # empty result = DataFrame({"x": [], "y": []}).quantile([0.1, 0.9], axis=0) @@ -245,7 +246,8 @@ def test_quantile_datetime(self): # exclude datetime result = df.quantile(0.5) - expected = Series([2.5], index=["b"]) + expected = Series([2.5], index=["b"], name=0.5) + tm.assert_series_equal(result, expected) # datetime result = df.quantile(0.5, numeric_only=False) @@ -280,9 +282,13 @@ def test_quantile_datetime(self): tm.assert_frame_equal(result, expected) # empty when numeric_only=True - # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) - # result = df[['a', 'c']].quantile(.5) - # result = df[['a', 'c']].quantile([.5]) + result = df[["a", "c"]].quantile(0.5) + expected = Series([], index=[], name=0.5) + tm.assert_series_equal(result, expected) + + result = df[["a", "c"]].quantile([0.5]) + expected = DataFrame([], index=[0.5]) + tm.assert_frame_equal(result, expected) def test_quantile_invalid(self, 
datetime_frame): msg = "percentiles should all be in the interval \\[0, 1\\]" @@ -538,6 +544,551 @@ def test_quantile_item_cache(self, using_array_manager): assert df.iloc[0, 0] == df["A"][0] +class TestDataFrameMultiQuantile: + @pytest.mark.parametrize( + "df,expected", + [ + [ + DataFrame( + { + 0: Series(pd.arrays.SparseArray([1, 2, 3])), + 1: Series(pd.arrays.SparseArray([4, 5, 6])), + } + ), + Series([2, 5], name=0.5), + ], + [ + DataFrame(Series([0.0, None, 1.0, 2.0], dtype="Sparse[float]")), + Series([2.0], name=0.5), + ], + ], + ) + def test_quantile_sparse(self, df, expected): + # GH#17198 + # GH#24600 + result = df.quantiles() + + tm.assert_series_equal(result, expected) + + def test_quantile(self, datetime_frame): + from numpy import percentile + + df = datetime_frame + q = df.quantiles(0.1, axis=0) + assert ( + q["A"] + == percentile(df.to_records(index=False), 10, interpolation="nearest")[0] + ) + tm.assert_index_equal(q.index, df.columns) + + q = df.quantiles(0.9, axis=1) + assert ( + q["2000-01-03"] + == percentile(df.T.to_records(index=False), 90, interpolation="nearest")[0] + ) + tm.assert_index_equal(q.index, df.index) + + # test degenerate case + q = DataFrame({"x": [], "y": []}).quantiles(0.1, axis=0) + assert np.isnan(q["x"]) and np.isnan(q["y"]) + + # non-numeric exclusion + df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]}) + rs = df.quantiles(0.5, interpolation="lower") + with tm.assert_produces_warning(FutureWarning, match="Select only valid"): + xp = df.median().astype(int).rename(0.5) + tm.assert_series_equal(rs, xp) + + # axis + df = DataFrame( + {"A": [1, 2, 3], "B": [2, 3, 4], "C": [4, 5, 6]}, index=[1, 2, 3] + ) + result = df.quantiles(0.5, axis=1) + expected = Series([2, 3, 4], index=[1, 2, 3], name=0.5) + tm.assert_series_equal(result, expected) + + result = df.quantiles([0.5, 0.75], axis=1) + expected = DataFrame({1: [2, 4], 2: [3, 5], 3: [4, 6]}, index=[0.5, 0.75]) + tm.assert_frame_equal(result, expected, 
check_index_type=True) + + # We may want to break API in the future to change this + # so that we exclude non-numeric along the same axis + # See GH #7312 + df = DataFrame([[1, 2, 3], ["a", "b", 4]]) + result = df.quantiles(0.5, axis=1) + expected = Series([3, 4], index=[0, 1], name=0.5) + tm.assert_series_equal(result, expected) + + def test_quantile_date_range(self): + # GH 2460 + + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + ser = Series(dti) + df = DataFrame(ser) + + result = df.quantiles(numeric_only=False) + expected = Series(["2016-01-02"], name=0.5, dtype="datetime64[ns, US/Pacific]") + + tm.assert_series_equal(result, expected) + + def test_quantile_axis_mixed(self): + + # mixed on axis=1 + df = DataFrame( + { + "A": [1, 2, 3], + "B": [2.0, 3.0, 4.0], + "C": pd.date_range("20130101", periods=3), + "D": ["foo", "bar", "baz"], + } + ) + result = df.quantiles(0.5, axis=1) + expected = Series([1.0, 2.0, 3.0], name=0.5) + tm.assert_series_equal(result, expected) + + # must raise + msg = "'values' is not ordered" + with pytest.raises(TypeError, match=msg): + df.quantiles(0.5, axis=1, numeric_only=False) + + def test_quantile_axis_parameter(self): + # GH 9543/9544 + + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + + result = df.quantiles(0.5, axis=0) + + expected = Series([2, 3], index=["A", "B"], name=0.5) + tm.assert_series_equal(result, expected) + + expected = df.quantiles(0.5, axis="index") + tm.assert_series_equal(result, expected) + + result = df.quantiles(0.5, axis=1) + + expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5) + tm.assert_series_equal(result, expected) + + result = df.quantiles(0.5, axis="columns") + tm.assert_series_equal(result, expected) + + msg = "No axis named -1 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.quantiles(0.1, axis=-1) + msg = "No axis named column for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.quantiles(0.1, 
axis="column") + + def test_quantile_interpolation(self): + # see gh-10174 + + # interpolation method other than default nearest + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + result = df.quantiles(0.5, axis=1, interpolation="higher") + expected = Series([2, 3, 4], index=[1, 2, 3], name=0.5) + tm.assert_series_equal(result, expected) + + # cross-check interpolation=higher results in original dtype + exp = np.percentile( + np.array([[1, 2, 3], [2, 3, 4]]), 0.5, axis=0, interpolation="higher" + ) + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="int64") + tm.assert_series_equal(result, expected) + + # float + df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3]) + result = df.quantiles(0.5, axis=1, interpolation="higher") + expected = Series([2.0, 3.0, 4.0], index=[1, 2, 3], name=0.5) + tm.assert_series_equal(result, expected) + exp = np.percentile( + np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]), + 0.5, + axis=0, + interpolation="higher", + ) + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="float64") + tm.assert_series_equal(result, expected) + + # axis + result = df.quantiles([0.5, 0.75], axis=1, interpolation="lower") + expected = DataFrame( + {1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75] + ) + tm.assert_frame_equal(result, expected) + + # test degenerate case + df = DataFrame({"x": [], "y": []}) + q = df.quantiles(0.1, axis=0, interpolation="higher") + assert np.isnan(q["x"]) and np.isnan(q["y"]) + + # multi + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) + result = df.quantiles([0.25, 0.5], interpolation="lower") + + # https://github.com/numpy/numpy/issues/7163 + expected = DataFrame( + [[1, 1, 1], [2, 2, 2]], + index=[0.25, 0.5], + columns=["a", "b", "c"], + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_interpolation_datetime(self, datetime_frame): + # see gh-10174 + + # interpolation = nearest (default case) + df = datetime_frame 
+ q = df.quantiles(0.1, axis=0) + assert ( + q["A"] + == np.percentile(df.to_records(index=False), 10, interpolation="nearest")[0] + ) + + def test_quantile_interpolation_int(self, int_frame): + # see gh-10174 + + df = int_frame + # interpolation = nearest (default case) + q = df.quantiles(0.1) + assert ( + q["A"] + == np.percentile(df.to_records(index=False), 10, interpolation="nearest")[0] + ) + + # test with and without interpolation keyword + q1 = df.quantiles(0.1, axis=0, interpolation="nearest") + assert ( + q1["A"] + == np.percentile(df.to_records(index=False), 10, interpolation="nearest")[0] + ) + tm.assert_series_equal(q, q1) + + def test_quantile_multi(self): + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) + result = df.quantiles([0.25, 0.5]) + expected = DataFrame( + [[1, 1, 1], [2, 2, 2]], + index=[0.25, 0.5], + columns=["a", "b", "c"], + ) + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.quantiles([0.25, 0.5], axis=1) + expected = DataFrame( + [[1, 2, 3], [1, 2, 3]], index=[0.25, 0.5], columns=[0, 1, 2] + ) + tm.assert_frame_equal(result, expected) + + # empty + result = DataFrame({"x": [], "y": []}).quantiles([0.1, 0.9], axis=0) + expected = DataFrame( + {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9] + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_datetime(self): + df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]}) + + # exclude datetime + result = df.quantiles(0.5) + expected = Series([0], index=["b"], name=0.5) + tm.assert_series_equal(result, expected) + + # datetime + result = df.quantiles(0.5, numeric_only=False) + expected = Series([Timestamp("2010-01-01"), 0], index=["a", "b"], name=0.5) + tm.assert_series_equal(result, expected) + + # datetime w/ multi + result = df.quantiles([0.5], numeric_only=False) + expected = DataFrame( + [[Timestamp("2010-01-01"), 0]], index=[0.5], columns=["a", "b"] + ) + tm.assert_frame_equal(result, expected) + 
+ # axis = 1 + df["c"] = pd.to_datetime(["2011", "2012"]) + result = df[["a", "c"]].quantiles(0.5, axis=1, numeric_only=False) + expected = Series( + [Timestamp("2010-01-01"), Timestamp("2011-01-01")], + index=[0, 1], + name=0.5, + ) + tm.assert_series_equal(result, expected) + + result = df[["a", "c"]].quantiles([0.5], axis=1, numeric_only=False) + expected = DataFrame( + [[Timestamp("2010-01-01"), Timestamp("2011-01-01")]], + index=[0.5], + columns=[0, 1], + ) + tm.assert_frame_equal(result, expected) + + # empty when numeric_only=True + result = df[["a", "c"]].quantiles(0.5) + expected = Series([], index=[], name=0.5) + tm.assert_series_equal(result, expected) + + result = df[["a", "c"]].quantiles([0.5]) + expected = DataFrame([], index=[0.5]) + tm.assert_frame_equal(result, expected) + + def test_quantile_invalid(self, datetime_frame): + msg = "percentiles should all be in the interval \\[0, 1\\]" + for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: + with pytest.raises(ValueError, match=msg): + datetime_frame.quantiles(invalid) + + def test_quantile_box(self): + df = DataFrame( + { + "A": [ + Timestamp("2011-01-01"), + Timestamp("2011-01-02"), + Timestamp("2011-01-03"), + ], + "B": [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + Timestamp("2011-01-03", tz="US/Eastern"), + ], + "C": [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ], + } + ) + + res = df.quantiles(0.5, numeric_only=False) + + exp = Series( + [ + Timestamp("2011-01-02"), + Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + ], + name=0.5, + index=["A", "B", "C"], + ) + tm.assert_series_equal(res, exp) + + res = df.quantiles([0.5], numeric_only=False) + exp = DataFrame( + [ + [ + Timestamp("2011-01-02"), + Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + ] + ], + index=[0.5], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(res, exp) + + # DatetimeLikeBlock may be consolidated and 
contain NaT in different loc + df = DataFrame( + { + "A": [ + Timestamp("2011-01-01"), + pd.NaT, + Timestamp("2011-01-02"), + Timestamp("2011-01-03"), + ], + "a": [ + Timestamp("2011-01-01"), + Timestamp("2011-01-02"), + pd.NaT, + Timestamp("2011-01-03"), + ], + "B": [ + Timestamp("2011-01-01", tz="US/Eastern"), + pd.NaT, + Timestamp("2011-01-02", tz="US/Eastern"), + Timestamp("2011-01-03", tz="US/Eastern"), + ], + "b": [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + pd.NaT, + Timestamp("2011-01-03", tz="US/Eastern"), + ], + "C": [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.NaT, + ], + "c": [ + pd.NaT, + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ], + }, + columns=list("AaBbCc"), + ) + + res = df.quantiles(0.5, numeric_only=False) + exp = Series( + [ + Timestamp("2011-01-03"), + Timestamp("2011-01-03"), + Timestamp("2011-01-03", tz="US/Eastern"), + Timestamp("2011-01-03", tz="US/Eastern"), + pd.NaT, + pd.Timedelta("3 days"), + ], + name=0.5, + index=list("AaBbCc"), + ) + tm.assert_series_equal(res, exp) + + res = df.quantiles([0.5], numeric_only=False) + exp = DataFrame( + [ + [ + Timestamp("2011-01-03"), + Timestamp("2011-01-03"), + Timestamp("2011-01-03", tz="US/Eastern"), + Timestamp("2011-01-03", tz="US/Eastern"), + pd.NaT, + pd.Timedelta("3 days"), + ] + ], + index=[0.5], + columns=list("AaBbCc"), + ) + tm.assert_frame_equal(res, exp) + + def test_quantile_nan(self): + + # GH 14357 - float block where some cols have missing values + df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)}) + df.iloc[-1, 1] = np.nan + + res = df.quantiles(0.5) + exp = Series([3.0, 3.0], index=["a", "b"], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantiles([0.5, 0.75]) + exp = DataFrame({"a": [3.0, 4.0], "b": [3.0, 4.0]}, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + res = df.quantiles(0.5, axis=1) + exp = Series(np.arange(1.0, 
6.0), name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantiles([0.5, 0.75], axis=1) + exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75]) + exp.iloc[1, -1] = np.nan + tm.assert_frame_equal(res, exp) + + # full-nan column + df["b"] = np.nan + + res = df.quantiles(0.5) + exp = Series([3.0, np.nan], index=["a", "b"], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantiles([0.5, 0.75]) + exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + def test_quantile_nat(self): + + # full NaT column + df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}) + + res = df.quantiles(0.5, numeric_only=False) + exp = Series([pd.NaT], index=["a"], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantiles([0.5], numeric_only=False) + exp = DataFrame({"a": [pd.NaT]}, index=[0.5]) + tm.assert_frame_equal(res, exp) + + # mixed non-null / full null column + df = DataFrame( + { + "a": [ + Timestamp("2012-01-01"), + Timestamp("2012-01-02"), + Timestamp("2012-01-03"), + ], + "b": [pd.NaT, pd.NaT, pd.NaT], + } + ) + + res = df.quantiles(0.5, numeric_only=False) + exp = Series([Timestamp("2012-01-02"), pd.NaT], index=["a", "b"], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantiles([0.5], numeric_only=False) + exp = DataFrame( + [[Timestamp("2012-01-02"), pd.NaT]], index=[0.5], columns=["a", "b"] + ) + tm.assert_frame_equal(res, exp) + + def test_quantile_empty_no_rows(self): + + # floats + df = DataFrame(columns=["a", "b"], dtype="float64") + + res = df.quantiles(0.5) + exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantiles([0.5]) + exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5]) + tm.assert_frame_equal(res, exp) + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantiles(0.5, axis=1) + # res = df.quantiles([0.5], axis=1) + + # ints + df = DataFrame(columns=["a", "b"], 
dtype="int64") + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantiles(0.5) + + # datetimes + df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]") + + # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0) + # res = df.quantiles(0.5, numeric_only=False) + + def test_quantile_empty_no_columns(self): + # GH#23925 _get_numeric_data may drop all columns + df = DataFrame(pd.date_range("1/1/18", periods=5)) + df.columns.name = "captain tightpants" + result = df.quantiles(0.5) + expected = Series([], index=[], name=0.5, dtype=np.float64) + expected.index.name = "captain tightpants" + tm.assert_series_equal(result, expected) + + result = df.quantiles([0.5]) + expected = DataFrame([], index=[0.5], columns=[]) + expected.columns.name = "captain tightpants" + tm.assert_frame_equal(result, expected) + + def test_quantile_item_cache(self, using_array_manager): + # previous behavior incorrect retained an invalid _item_cache entry + df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) + df["D"] = df["A"] * 2 + ser = df["A"] + if not using_array_manager: + assert len(df._mgr.blocks) == 2 + + df.quantiles(numeric_only=False) + ser.values[0] = 99 + + assert df.iloc[0, 0] == df["A"][0] + + class TestQuantileExtensionDtype: # TODO: tests for axis=1? # TODO: empty case? 
might as well do dt64 and td64 here too From 8cc274f80845caa946837f36d70396cd9e431bcd Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 16 Nov 2021 08:58:01 -0800 Subject: [PATCH 04/22] Migrate quantiles code to quantile(method='table') --- pandas/core/frame.py | 149 +++++------------- pandas/tests/frame/methods/test_quantile.py | 162 +++++++++++++------- 2 files changed, 140 insertions(+), 171 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cb8d1d06a2220..9ca9a6d7a8477 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10366,9 +10366,10 @@ def quantile( axis: Axis = 0, numeric_only: bool = True, interpolation: str = "linear", + method: str = "single", ): """ - Return values at the given quantile over requested axis, per-column. + Return values at the given quantile over requested axis. Parameters ---------- @@ -10389,6 +10390,10 @@ def quantile( * higher: `j`. * nearest: `i` or `j` whichever is nearest. * midpoint: (`i` + `j`) / 2. + method : {'single', 'table'}, default 'single' + Whether to compute quantiles per-column ('single') or over all columns + ('table'). When 'table', the only allowed interpolation methods are + 'nearest', 'lower', and 'higher'. Returns ------- @@ -10418,6 +10423,17 @@ def quantile( 0.1 1.3 3.7 0.5 2.5 55.0 + Specifying `method='table'` will compute the quantile over all columns. + + >>> df.quantile(.1, method="table", interpolation="nearest") + a 1 + b 1 + Name: 0.1, dtype: int64 + >>> df.quantile([.1, .5], method="table", interpolation="nearest") + a b + 0.1 1 1 + 0.5 3 100 + Specifying `numeric_only=False` will also compute the quantile of datetime and timedelta data. 
@@ -10437,7 +10453,11 @@ def quantile( if not is_list_like(q): # BlockManager.quantile expects listlike, so we wrap and unwrap here res = self.quantile( - [q], axis=axis, numeric_only=numeric_only, interpolation=interpolation + [q], + axis=axis, + numeric_only=numeric_only, + interpolation=interpolation, + method=method, ) return res.iloc[0] @@ -10455,121 +10475,26 @@ def quantile( return self._constructor([], index=q, columns=cols) return self._constructor_sliced([], index=cols, name=q, dtype=np.float64) - res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation) - - result = self._constructor(res) - return result - - def quantiles( - self, - q=0.5, - axis: Axis = 0, - numeric_only: bool = True, - interpolation: str = "nearest", - ): - """ - Return values at the given quantile over requested axis for all columns. - - Parameters - ---------- - q : float or array-like, default 0.5 (50% quantile) - Value between 0 <= q <= 1, the quantile(s) to compute. - axis : {0, 1, 'index', 'columns'}, default 0 - Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - numeric_only : bool, default True - If False, datetime and timedelta data will be included in the - quantile computation. - interpolation : {'lower', 'higher', 'nearest'}, default 'nearest' - This optional parameter specifies the interpolation method to use, - when the desired quantile lies between two data points `i` and `j`: - - * lower: `i`. - * higher: `j`. - * nearest: `i` or `j` whichever is nearest. - - Returns - ------- - Series or DataFrame - - If ``q`` is an array, a DataFrame will be returned where the - index is ``q``, the columns are the columns of self, and the - values are the quantiles. - If ``q`` is a float, a Series will be returned where the - index is the columns of self and the values are the quantiles. - - See Also - -------- - core.window.Rolling.quantile: Rolling quantile. - numpy.percentile: Numpy function to compute the percentile. 
- - Examples - -------- - >>> df = pd.DataFrame(np.array([[1, 10], [1, 2], [2, 100], [2, 50]]), - ... columns=['a', 'b']) - >>> df.quantiles(.1) - a 1 - b 2 - Name: 0.1, dtype: int64 - >>> df.quantiles([.1, .5]) - a b - 0.1 1 2 - 0.5 2 50 - - Specifying `numeric_only=False` will also compute the quantile of - datetime and timedelta data. - - >>> df = pd.DataFrame({'A': [1, 2], - ... 'B': [pd.Timestamp('2010'), - ... pd.Timestamp('2011')], - ... 'C': [pd.Timedelta('1 days'), - ... pd.Timedelta('2 days')]}) - >>> df.quantiles(0.5, numeric_only=False) - A 1 - B 2010-01-01 00:00:00 - C 1 days 00:00:00 - Name: 0.5, dtype: object - """ - validate_percentile(q) - - return_series = False - if not is_list_like(q): - return_series = True - q = [q] - - q = Index(q, dtype=np.float64) - data = self._get_numeric_data() if numeric_only else self - axis = self._get_axis_number(axis) + if method == "single": + res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation) + elif method == "table": + q_idx = np.quantile(np.arange(len(data)), q, interpolation=interpolation) - if axis == 1: - data = data.T - - if len(data.columns) == 0: - # GH#23925 _get_numeric_data may have dropped all columns - cols = Index([], name=self.columns.name) - if is_list_like(q): - return self._constructor([], index=q, columns=cols) - return self._constructor_sliced([], index=cols, name=q, dtype=np.float64) - - q_idx = np.quantile(np.arange(len(data)), q, interpolation=interpolation) + by = data.columns.tolist() + if len(by) > 1: + keys = [data._get_label_or_level_values(x) for x in by] + indexer = lexsort_indexer(keys) + else: + by = by[0] + k = data._get_label_or_level_values(by) + indexer = nargsort(k) - by = data.columns.tolist() - if len(by) > 1: - keys = [data._get_label_or_level_values(x) for x in by] - indexer = lexsort_indexer(keys) + res = data._mgr.take(indexer[q_idx], verify=False) + res.axes[1] = q else: - by = by[0] - k = data._get_label_or_level_values(by) - indexer = nargsort(k) - - 
res = data._mgr.take(indexer[q_idx], verify=False) + raise ValueError(f"Invalid quantiles method: {method}") result = self._constructor(res) - if return_series: - result = result.T.iloc[:, 0] - result.name = q[0] - else: - result.index = q - return result @doc(NDFrame.asfreq, **_shared_doc_kwargs) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index bb088715efd76..986a5ad5d551a 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -566,7 +566,7 @@ class TestDataFrameMultiQuantile: def test_quantile_sparse(self, df, expected): # GH#17198 # GH#24600 - result = df.quantiles() + result = df.quantile(interpolation="nearest", method="table") tm.assert_series_equal(result, expected) @@ -574,14 +574,14 @@ def test_quantile(self, datetime_frame): from numpy import percentile df = datetime_frame - q = df.quantiles(0.1, axis=0) + q = df.quantile(0.1, axis=0, interpolation="nearest", method="table") assert ( q["A"] == percentile(df.to_records(index=False), 10, interpolation="nearest")[0] ) tm.assert_index_equal(q.index, df.columns) - q = df.quantiles(0.9, axis=1) + q = df.quantile(0.9, axis=1, interpolation="nearest", method="table") assert ( q["2000-01-03"] == percentile(df.T.to_records(index=False), 90, interpolation="nearest")[0] @@ -589,12 +589,14 @@ def test_quantile(self, datetime_frame): tm.assert_index_equal(q.index, df.index) # test degenerate case - q = DataFrame({"x": [], "y": []}).quantiles(0.1, axis=0) + q = DataFrame({"x": [], "y": []}).quantile( + 0.1, axis=0, interpolation="nearest", method="table" + ) assert np.isnan(q["x"]) and np.isnan(q["y"]) # non-numeric exclusion df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]}) - rs = df.quantiles(0.5, interpolation="lower") + rs = df.quantile(0.5, interpolation="lower", method="table") with tm.assert_produces_warning(FutureWarning, match="Select only valid"): xp = 
df.median().astype(int).rename(0.5) tm.assert_series_equal(rs, xp) @@ -603,11 +605,13 @@ def test_quantile(self, datetime_frame): df = DataFrame( {"A": [1, 2, 3], "B": [2, 3, 4], "C": [4, 5, 6]}, index=[1, 2, 3] ) - result = df.quantiles(0.5, axis=1) + result = df.quantile(0.5, axis=1, interpolation="nearest", method="table") expected = Series([2, 3, 4], index=[1, 2, 3], name=0.5) tm.assert_series_equal(result, expected) - result = df.quantiles([0.5, 0.75], axis=1) + result = df.quantile( + [0.5, 0.75], axis=1, interpolation="nearest", method="table" + ) expected = DataFrame({1: [2, 4], 2: [3, 5], 3: [4, 6]}, index=[0.5, 0.75]) tm.assert_frame_equal(result, expected, check_index_type=True) @@ -615,7 +619,7 @@ def test_quantile(self, datetime_frame): # so that we exclude non-numeric along the same axis # See GH #7312 df = DataFrame([[1, 2, 3], ["a", "b", 4]]) - result = df.quantiles(0.5, axis=1) + result = df.quantile(0.5, axis=1, interpolation="nearest", method="table") expected = Series([3, 4], index=[0, 1], name=0.5) tm.assert_series_equal(result, expected) @@ -626,7 +630,9 @@ def test_quantile_date_range(self): ser = Series(dti) df = DataFrame(ser) - result = df.quantiles(numeric_only=False) + result = df.quantile( + numeric_only=False, interpolation="nearest", method="table" + ) expected = Series(["2016-01-02"], name=0.5, dtype="datetime64[ns, US/Pacific]") tm.assert_series_equal(result, expected) @@ -642,49 +648,55 @@ def test_quantile_axis_mixed(self): "D": ["foo", "bar", "baz"], } ) - result = df.quantiles(0.5, axis=1) + result = df.quantile(0.5, axis=1, interpolation="nearest", method="table") expected = Series([1.0, 2.0, 3.0], name=0.5) tm.assert_series_equal(result, expected) # must raise msg = "'values' is not ordered" with pytest.raises(TypeError, match=msg): - df.quantiles(0.5, axis=1, numeric_only=False) + df.quantile( + 0.5, axis=1, numeric_only=False, interpolation="nearest", method="table" + ) def test_quantile_axis_parameter(self): # GH 9543/9544 
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantiles(0.5, axis=0) + result = df.quantile(0.5, axis=0, interpolation="nearest", method="table") expected = Series([2, 3], index=["A", "B"], name=0.5) tm.assert_series_equal(result, expected) - expected = df.quantiles(0.5, axis="index") + expected = df.quantile( + 0.5, axis="index", interpolation="nearest", method="table" + ) tm.assert_series_equal(result, expected) - result = df.quantiles(0.5, axis=1) + result = df.quantile(0.5, axis=1, interpolation="nearest", method="table") expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5) tm.assert_series_equal(result, expected) - result = df.quantiles(0.5, axis="columns") + result = df.quantile( + 0.5, axis="columns", interpolation="nearest", method="table" + ) tm.assert_series_equal(result, expected) msg = "No axis named -1 for object type DataFrame" with pytest.raises(ValueError, match=msg): - df.quantiles(0.1, axis=-1) + df.quantile(0.1, axis=-1, interpolation="nearest", method="table") msg = "No axis named column for object type DataFrame" with pytest.raises(ValueError, match=msg): - df.quantiles(0.1, axis="column") + df.quantile(0.1, axis="column", interpolation="nearest", method="table") def test_quantile_interpolation(self): # see gh-10174 # interpolation method other than default nearest df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantiles(0.5, axis=1, interpolation="higher") + result = df.quantile(0.5, axis=1, interpolation="higher", method="table") expected = Series([2, 3, 4], index=[1, 2, 3], name=0.5) tm.assert_series_equal(result, expected) @@ -697,7 +709,7 @@ def test_quantile_interpolation(self): # float df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3]) - result = df.quantiles(0.5, axis=1, interpolation="higher") + result = df.quantile(0.5, axis=1, interpolation="higher", method="table") expected = Series([2.0, 3.0, 4.0], index=[1, 2, 3], name=0.5) 
tm.assert_series_equal(result, expected) exp = np.percentile( @@ -710,7 +722,7 @@ def test_quantile_interpolation(self): tm.assert_series_equal(result, expected) # axis - result = df.quantiles([0.5, 0.75], axis=1, interpolation="lower") + result = df.quantile([0.5, 0.75], axis=1, interpolation="lower", method="table") expected = DataFrame( {1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75] ) @@ -718,12 +730,12 @@ def test_quantile_interpolation(self): # test degenerate case df = DataFrame({"x": [], "y": []}) - q = df.quantiles(0.1, axis=0, interpolation="higher") + q = df.quantile(0.1, axis=0, interpolation="higher", method="table") assert np.isnan(q["x"]) and np.isnan(q["y"]) # multi df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) - result = df.quantiles([0.25, 0.5], interpolation="lower") + result = df.quantile([0.25, 0.5], interpolation="lower", method="table") # https://github.com/numpy/numpy/issues/7163 expected = DataFrame( @@ -738,7 +750,7 @@ def test_quantile_interpolation_datetime(self, datetime_frame): # interpolation = nearest (default case) df = datetime_frame - q = df.quantiles(0.1, axis=0) + q = df.quantile(0.1, axis=0, interpolation="nearest", method="table") assert ( q["A"] == np.percentile(df.to_records(index=False), 10, interpolation="nearest")[0] @@ -749,14 +761,14 @@ def test_quantile_interpolation_int(self, int_frame): df = int_frame # interpolation = nearest (default case) - q = df.quantiles(0.1) + q = df.quantile(0.1, interpolation="nearest", method="table") assert ( q["A"] == np.percentile(df.to_records(index=False), 10, interpolation="nearest")[0] ) # test with and without interpolation keyword - q1 = df.quantiles(0.1, axis=0, interpolation="nearest") + q1 = df.quantile(0.1, axis=0, interpolation="nearest", method="table") assert ( q1["A"] == np.percentile(df.to_records(index=False), 10, interpolation="nearest")[0] @@ -765,7 +777,7 @@ def test_quantile_interpolation_int(self, int_frame): def 
test_quantile_multi(self): df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) - result = df.quantiles([0.25, 0.5]) + result = df.quantile([0.25, 0.5], interpolation="nearest", method="table") expected = DataFrame( [[1, 1, 1], [2, 2, 2]], index=[0.25, 0.5], @@ -774,14 +786,18 @@ def test_quantile_multi(self): tm.assert_frame_equal(result, expected) # axis = 1 - result = df.quantiles([0.25, 0.5], axis=1) + result = df.quantile( + [0.25, 0.5], axis=1, interpolation="nearest", method="table" + ) expected = DataFrame( [[1, 2, 3], [1, 2, 3]], index=[0.25, 0.5], columns=[0, 1, 2] ) tm.assert_frame_equal(result, expected) # empty - result = DataFrame({"x": [], "y": []}).quantiles([0.1, 0.9], axis=0) + result = DataFrame({"x": [], "y": []}).quantile( + [0.1, 0.9], axis=0, interpolation="nearest", method="table" + ) expected = DataFrame( {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9] ) @@ -791,17 +807,21 @@ def test_quantile_datetime(self): df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]}) # exclude datetime - result = df.quantiles(0.5) + result = df.quantile(0.5, interpolation="nearest", method="table") expected = Series([0], index=["b"], name=0.5) tm.assert_series_equal(result, expected) # datetime - result = df.quantiles(0.5, numeric_only=False) + result = df.quantile( + 0.5, numeric_only=False, interpolation="nearest", method="table" + ) expected = Series([Timestamp("2010-01-01"), 0], index=["a", "b"], name=0.5) tm.assert_series_equal(result, expected) # datetime w/ multi - result = df.quantiles([0.5], numeric_only=False) + result = df.quantile( + [0.5], numeric_only=False, interpolation="nearest", method="table" + ) expected = DataFrame( [[Timestamp("2010-01-01"), 0]], index=[0.5], columns=["a", "b"] ) @@ -809,7 +829,9 @@ def test_quantile_datetime(self): # axis = 1 df["c"] = pd.to_datetime(["2011", "2012"]) - result = df[["a", "c"]].quantiles(0.5, axis=1, numeric_only=False) + result = df[["a", 
"c"]].quantile( + 0.5, axis=1, numeric_only=False, interpolation="nearest", method="table" + ) expected = Series( [Timestamp("2010-01-01"), Timestamp("2011-01-01")], index=[0, 1], @@ -817,7 +839,9 @@ def test_quantile_datetime(self): ) tm.assert_series_equal(result, expected) - result = df[["a", "c"]].quantiles([0.5], axis=1, numeric_only=False) + result = df[["a", "c"]].quantile( + [0.5], axis=1, numeric_only=False, interpolation="nearest", method="table" + ) expected = DataFrame( [[Timestamp("2010-01-01"), Timestamp("2011-01-01")]], index=[0.5], @@ -826,11 +850,11 @@ def test_quantile_datetime(self): tm.assert_frame_equal(result, expected) # empty when numeric_only=True - result = df[["a", "c"]].quantiles(0.5) + result = df[["a", "c"]].quantile(0.5, interpolation="nearest", method="table") expected = Series([], index=[], name=0.5) tm.assert_series_equal(result, expected) - result = df[["a", "c"]].quantiles([0.5]) + result = df[["a", "c"]].quantile([0.5], interpolation="nearest", method="table") expected = DataFrame([], index=[0.5]) tm.assert_frame_equal(result, expected) @@ -838,7 +862,9 @@ def test_quantile_invalid(self, datetime_frame): msg = "percentiles should all be in the interval \\[0, 1\\]" for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: with pytest.raises(ValueError, match=msg): - datetime_frame.quantiles(invalid) + datetime_frame.quantile( + invalid, interpolation="nearest", method="table" + ) def test_quantile_box(self): df = DataFrame( @@ -861,7 +887,9 @@ def test_quantile_box(self): } ) - res = df.quantiles(0.5, numeric_only=False) + res = df.quantile( + 0.5, numeric_only=False, interpolation="nearest", method="table" + ) exp = Series( [ @@ -874,7 +902,9 @@ def test_quantile_box(self): ) tm.assert_series_equal(res, exp) - res = df.quantiles([0.5], numeric_only=False) + res = df.quantile( + [0.5], numeric_only=False, interpolation="nearest", method="table" + ) exp = DataFrame( [ [ @@ -931,7 +961,9 @@ def test_quantile_box(self): columns=list("AaBbCc"), ) 
- res = df.quantiles(0.5, numeric_only=False) + res = df.quantile( + 0.5, numeric_only=False, interpolation="nearest", method="table" + ) exp = Series( [ Timestamp("2011-01-03"), @@ -946,7 +978,9 @@ def test_quantile_box(self): ) tm.assert_series_equal(res, exp) - res = df.quantiles([0.5], numeric_only=False) + res = df.quantile( + [0.5], numeric_only=False, interpolation="nearest", method="table" + ) exp = DataFrame( [ [ @@ -969,19 +1003,19 @@ def test_quantile_nan(self): df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)}) df.iloc[-1, 1] = np.nan - res = df.quantiles(0.5) + res = df.quantile(0.5, interpolation="nearest", method="table") exp = Series([3.0, 3.0], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) - res = df.quantiles([0.5, 0.75]) + res = df.quantile([0.5, 0.75], interpolation="nearest", method="table") exp = DataFrame({"a": [3.0, 4.0], "b": [3.0, 4.0]}, index=[0.5, 0.75]) tm.assert_frame_equal(res, exp) - res = df.quantiles(0.5, axis=1) + res = df.quantile(0.5, axis=1, interpolation="nearest", method="table") exp = Series(np.arange(1.0, 6.0), name=0.5) tm.assert_series_equal(res, exp) - res = df.quantiles([0.5, 0.75], axis=1) + res = df.quantile([0.5, 0.75], axis=1, interpolation="nearest", method="table") exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75]) exp.iloc[1, -1] = np.nan tm.assert_frame_equal(res, exp) @@ -989,11 +1023,11 @@ def test_quantile_nan(self): # full-nan column df["b"] = np.nan - res = df.quantiles(0.5) + res = df.quantile(0.5, interpolation="nearest", method="table") exp = Series([3.0, np.nan], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) - res = df.quantiles([0.5, 0.75]) + res = df.quantile([0.5, 0.75], interpolation="nearest", method="table") exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75]) tm.assert_frame_equal(res, exp) @@ -1002,11 +1036,15 @@ def test_quantile_nat(self): # full NaT column df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}) - res = 
df.quantiles(0.5, numeric_only=False) + res = df.quantile( + 0.5, numeric_only=False, interpolation="nearest", method="table" + ) exp = Series([pd.NaT], index=["a"], name=0.5) tm.assert_series_equal(res, exp) - res = df.quantiles([0.5], numeric_only=False) + res = df.quantile( + [0.5], numeric_only=False, interpolation="nearest", method="table" + ) exp = DataFrame({"a": [pd.NaT]}, index=[0.5]) tm.assert_frame_equal(res, exp) @@ -1022,11 +1060,15 @@ def test_quantile_nat(self): } ) - res = df.quantiles(0.5, numeric_only=False) + res = df.quantile( + 0.5, numeric_only=False, interpolation="nearest", method="table" + ) exp = Series([Timestamp("2012-01-02"), pd.NaT], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) - res = df.quantiles([0.5], numeric_only=False) + res = df.quantile( + [0.5], numeric_only=False, interpolation="nearest", method="table" + ) exp = DataFrame( [[Timestamp("2012-01-02"), pd.NaT]], index=[0.5], columns=["a", "b"] ) @@ -1037,40 +1079,42 @@ def test_quantile_empty_no_rows(self): # floats df = DataFrame(columns=["a", "b"], dtype="float64") - res = df.quantiles(0.5) + res = df.quantile(0.5, interpolation="nearest", method="table") exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) - res = df.quantiles([0.5]) + res = df.quantile([0.5], interpolation="nearest", method="table") exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5]) tm.assert_frame_equal(res, exp) # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) - # res = df.quantiles(0.5, axis=1) - # res = df.quantiles([0.5], axis=1) + # res = df.quantile(0.5, axis=1, interpolation="nearest", method="table") + # res = df.quantile([0.5], axis=1, interpolation="nearest", method="table") # ints df = DataFrame(columns=["a", "b"], dtype="int64") # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) - # res = df.quantiles(0.5) + # res = df.quantile(0.5, interpolation="nearest", method="table") # datetimes df = 
DataFrame(columns=["a", "b"], dtype="datetime64[ns]") # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0) - # res = df.quantiles(0.5, numeric_only=False) + # res = df.quantile( + # 0.5, numeric_only=False, interpolation="nearest", method="table" + # ) def test_quantile_empty_no_columns(self): # GH#23925 _get_numeric_data may drop all columns df = DataFrame(pd.date_range("1/1/18", periods=5)) df.columns.name = "captain tightpants" - result = df.quantiles(0.5) + result = df.quantile(0.5, interpolation="nearest", method="table") expected = Series([], index=[], name=0.5, dtype=np.float64) expected.index.name = "captain tightpants" tm.assert_series_equal(result, expected) - result = df.quantiles([0.5]) + result = df.quantile([0.5], interpolation="nearest", method="table") expected = DataFrame([], index=[0.5], columns=[]) expected.columns.name = "captain tightpants" tm.assert_frame_equal(result, expected) @@ -1083,7 +1127,7 @@ def test_quantile_item_cache(self, using_array_manager): if not using_array_manager: assert len(df._mgr.blocks) == 2 - df.quantiles(numeric_only=False) + df.quantile(numeric_only=False, interpolation="nearest", method="table") ser.values[0] = 99 assert df.iloc[0, 0] == df["A"][0] From ee28436dd07b6a7a76b4b006d1ce55744a7e68ed Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 17 Nov 2021 13:17:54 -0800 Subject: [PATCH 05/22] Add handling for degenerate case --- pandas/core/frame.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9ca9a6d7a8477..8d11c68e7a291 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10478,6 +10478,12 @@ def quantile( if method == "single": res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation) elif method == "table": + # handle degenerate case + if len(data) == 0: + return self._constructor( + [], index=q, columns=data.columns, dtype=np.float64 + ) + q_idx = 
np.quantile(np.arange(len(data)), q, interpolation=interpolation) by = data.columns.tolist() From 259ff598f82fb8d22522596790f2929b2f620d91 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 18 Nov 2021 12:15:32 -0800 Subject: [PATCH 06/22] Fix incorrect assertion in test_quantile_multi --- pandas/tests/frame/methods/test_quantile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index dd1b1084e352c..2a03eda14e3f2 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -230,7 +230,7 @@ def test_quantile_multi(self): # axis = 1 result = df.quantile([0.25, 0.5], axis=1) expected = DataFrame( - [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], index=[0.25, 0.5], columns=[0, 1, 2] + [[1.0, 2.0, 3.0]] * 2, index=[0.25, 0.5], columns=[0, 1, 2] ) tm.assert_frame_equal(result, expected) From c074638ea8df3536904ce039c730efd4f3040f3c Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 18 Nov 2021 12:25:13 -0800 Subject: [PATCH 07/22] Improve non-numeric exclusion test --- pandas/tests/frame/methods/test_quantile.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 2a03eda14e3f2..ad31612e7b50e 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -618,10 +618,9 @@ def test_quantile(self, datetime_frame): # non-numeric exclusion df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]}) - rs = df.quantile(0.5, interpolation="lower", method="table") - with tm.assert_produces_warning(FutureWarning, match="Select only valid"): - xp = df.median().astype(int).rename(0.5) - tm.assert_series_equal(rs, xp) + result = df.quantile(0.5, 
interpolation="nearest", method="table") + expected = Series([3], index=["col2"], name=0.5) + tm.assert_series_equal(result, expected) # axis df = DataFrame( From ec24040de7ceeec74783bb70100e36516c186af8 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 18 Nov 2021 13:23:14 -0800 Subject: [PATCH 08/22] Resolve test_quantile_box failures --- pandas/tests/frame/methods/test_quantile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index ad31612e7b50e..e2fc22bfb149a 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -1009,7 +1009,7 @@ def test_quantile_box(self): Timestamp("2011-01-03"), Timestamp("2011-01-03", tz="US/Eastern"), Timestamp("2011-01-03", tz="US/Eastern"), - pd.NaT, + np.timedelta64("NaT", "ns"), pd.Timedelta("3 days"), ] ], From 3b4247201b9e33f0d12febc3be2c52f2a72788f4 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 13 Jan 2022 10:37:06 -0500 Subject: [PATCH 09/22] Rename res to res_df --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c56c4f12a610e..fac2f789e5d5b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10483,7 +10483,7 @@ def quantile( if not is_list_like(q): # BlockManager.quantile expects listlike, so we wrap and unwrap here - res = self.quantile( + res_df = self.quantile( [q], axis=axis, numeric_only=numeric_only, From e6229c6cf11fe42fcd70cee3a80f2d4bc972f4e1 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 13 Jan 2022 11:23:42 -0500 Subject: [PATCH 10/22] Resolve sparse test failures --- pandas/core/frame.py | 6 +++++- pandas/tests/frame/methods/test_quantile.py | 4 ++-- 2 files 
changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fac2f789e5d5b..cf0c10853c155 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10490,7 +10490,11 @@ def quantile( interpolation=interpolation, method=method, ) - res = res_df.iloc[0] + try: + res = res_df.iloc[0] + except NotImplementedError: + # cannot iloc sparse arrays + res = res_df.T.iloc[:, 0] if axis == 1 and len(self) == 0: # GH#41544 try to get an appropriate dtype dtype = find_common_type(list(self.dtypes)) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 3bdca7c1d49d2..a741cceb24f60 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -604,11 +604,11 @@ class TestDataFrameMultiQuantile: 1: Series(pd.arrays.SparseArray([4, 5, 6])), } ), - Series([2, 5], name=0.5), + Series([2, 5], dtype="Sparse[int]", name=0.5), ], [ DataFrame(Series([0.0, None, 1.0, 2.0], dtype="Sparse[float]")), - Series([2.0], name=0.5), + Series([2.0], dtype="Sparse[float]", name=0.5), ], ], ) From 54240eba760c49b5552131cdf2eaaa18f6fb2a6e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 13 Jan 2022 12:49:03 -0500 Subject: [PATCH 11/22] Remove try/except block to try and resolve new failures --- pandas/core/frame.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cf0c10853c155..fac2f789e5d5b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10490,11 +10490,7 @@ def quantile( interpolation=interpolation, method=method, ) - try: - res = res_df.iloc[0] - except NotImplementedError: - # cannot iloc sparse arrays - res = res_df.T.iloc[:, 0] + res = res_df.iloc[0] if axis == 1 and len(self) == 0: # GH#41544 try to get an appropriate dtype dtype = find_common_type(list(self.dtypes)) From 
cec798faf24ba913621853b64c5c8109c05e1dce Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 14 Jan 2022 15:58:38 -0500 Subject: [PATCH 12/22] Check if tests resolve when we only use transpose to unwrap --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fac2f789e5d5b..28079f06b9c35 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10490,7 +10490,7 @@ def quantile( interpolation=interpolation, method=method, ) - res = res_df.iloc[0] + res = res_df.T.iloc[:, 0] if axis == 1 and len(self) == 0: # GH#41544 try to get an appropriate dtype dtype = find_common_type(list(self.dtypes)) From 04dbdfd108b3f261ea05f31a90f06b12a0a1656e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 14 Jan 2022 16:36:01 -0500 Subject: [PATCH 13/22] Add back in try / except block --- pandas/core/frame.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 28079f06b9c35..94a5fb54630af 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10490,7 +10490,11 @@ def quantile( interpolation=interpolation, method=method, ) - res = res_df.T.iloc[:, 0] + try: + res = res_df.iloc[0] + except NotImplementedError: + # cannot directly iloc over sparse arrays + res = res_df.T.iloc[:, 0] if axis == 1 and len(self) == 0: # GH#41544 try to get an appropriate dtype dtype = find_common_type(list(self.dtypes)) From 7bf7d18206ce03e25e43560766fb18611c5874b7 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 14 Jan 2022 17:10:55 -0500 Subject: [PATCH 14/22] Use if / else instead of try / except --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 
94a5fb54630af..5a3b3b080e529 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10490,9 +10490,9 @@ def quantile( interpolation=interpolation, method=method, ) - try: + if method == "single": res = res_df.iloc[0] - except NotImplementedError: + else: # cannot directly iloc over sparse arrays res = res_df.T.iloc[:, 0] if axis == 1 and len(self) == 0: From c9dd92f9ba62226b4f01084d7eab099276a76774 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 29 Jul 2022 16:13:30 -0700 Subject: [PATCH 15/22] Use pytest fixture to parameterize TestDataFrameQuantile --- pandas/core/frame.py | 32 +- pandas/tests/frame/methods/test_quantile.py | 856 +++++--------------- 2 files changed, 218 insertions(+), 670 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d924c1786b267..4bb10ce9031ed 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -80,7 +80,10 @@ npt, ) from pandas.compat._optional import import_optional_dependency -from pandas.compat.numpy import function as nv +from pandas.compat.numpy import ( + function as nv, + np_percentile_argname, +) from pandas.util._decorators import ( Appender, Substitution, @@ -11148,7 +11151,7 @@ def quantile( axis: Axis = 0, numeric_only: bool | lib.NoDefault = no_default, interpolation: str = "linear", - method: str = "single", + method: Literal["single", "table"] = "single", ): """ Return values at the given quantile over requested axis. @@ -11283,16 +11286,31 @@ def quantile( res = self._constructor([], index=q, columns=cols, dtype=dtype) return res.__finalize__(self, method="quantile") + valid_method = {"single", "table"} + if method not in valid_method: + raise ValueError( + f"Invalid method: {method}. Method must be in {valid_method}." 
+ ) if method == "single": res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation) elif method == "table": + valid_interpolation = {"nearest", "lower", "higher"} + if interpolation not in valid_interpolation: + raise ValueError( + f"Invalid interpolation: {interpolation}. " + f"Interpolation must be in {valid_interpolation}" + ) # handle degenerate case if len(data) == 0: - return self._constructor( - [], index=q, columns=data.columns, dtype=np.float64 - ) + if data.ndim == 2: + dtype = find_common_type(list(self.dtypes)) + else: + dtype = self.dtype + return self._constructor([], index=q, columns=data.columns, dtype=dtype) - q_idx = np.quantile(np.arange(len(data)), q, interpolation=interpolation) + q_idx = np.quantile( + np.arange(len(data)), q, **{np_percentile_argname: interpolation} + ) by = data.columns.tolist() if len(by) > 1: @@ -11305,8 +11323,6 @@ def quantile( res = data._mgr.take(indexer[q_idx], verify=False) res.axes[1] = q - else: - raise ValueError(f"Invalid quantiles method: {method}") result = self._constructor(res) return result.__finalize__(self, method="quantile") diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 3d003f0901a05..910c658b19806 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -13,6 +13,14 @@ import pandas._testing as tm +@pytest.fixture( + params=[["linear", "single"], ["nearest", "table"]], ids=lambda x: "-".join(x) +) +def interp_method(request): + """(interpolation, method) arguments for quantile""" + return request.param + + class TestDataFrameQuantile: @pytest.mark.parametrize( "non_num_col", @@ -22,8 +30,9 @@ class TestDataFrameQuantile: [DataFrame, Series, Timestamp], ], ) - def test_numeric_only_default_false_warning(self, non_num_col): + def test_numeric_only_default_false_warning(self, non_num_col, interp_method): # GH #7308 + interpolation, method = interp_method df = DataFrame({"A": [1, 2, 
3], "B": [2, 3, 4]}) df["C"] = non_num_col @@ -32,8 +41,10 @@ def test_numeric_only_default_false_warning(self, non_num_col): index=["A", "B"], name=0.5, ) + if interpolation == "nearest": + expected = expected.astype(np.int64) with tm.assert_produces_warning(FutureWarning, match="numeric_only"): - result = df.quantile(0.5) + result = df.quantile(0.5, interpolation=interpolation, method=method) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -61,66 +72,98 @@ def test_quantile_sparse(self, df, expected): tm.assert_series_equal(result, expected) - def test_quantile(self, datetime_frame): - from numpy import percentile - + def test_quantile(self, datetime_frame, interp_method): + interpolation, method = interp_method df = datetime_frame - q = df.quantile(0.1, axis=0, numeric_only=True) - assert q["A"] == percentile(df["A"], 10) + q = df.quantile( + 0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method + ) + if method == "single": + assert q["A"] == np.percentile(df["A"], 10) tm.assert_index_equal(q.index, df.columns) - q = df.quantile(0.9, axis=1, numeric_only=True) - assert q["2000-01-17"] == percentile(df.loc["2000-01-17"], 90) + q = df.quantile( + 0.9, axis=1, numeric_only=True, interpolation=interpolation, method=method + ) + if method == "single": + assert q["2000-01-17"] == np.percentile(df.loc["2000-01-17"], 90) tm.assert_index_equal(q.index, df.index) - # test degenerate case - q = DataFrame({"x": [], "y": []}).quantile(0.1, axis=0, numeric_only=True) + def test_empty(self, interp_method): + interpolation, method = interp_method + q = DataFrame({"x": [], "y": []}).quantile( + 0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method + ) assert np.isnan(q["x"]) and np.isnan(q["y"]) - # non-numeric exclusion + def test_non_numeric_exclusion(self, interp_method): + interpolation, method = interp_method df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]}) - rs = df.quantile(0.5, 
numeric_only=True) + rs = df.quantile( + 0.5, numeric_only=True, interpolation=interpolation, method=method + ) with tm.assert_produces_warning(FutureWarning, match="Select only valid"): xp = df.median().rename(0.5) + if interpolation == "nearest": + xp = (xp + 0.5).astype(np.int64) tm.assert_series_equal(rs, xp) + def test_axis(self, interp_method): # axis + interpolation, method = interp_method df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantile(0.5, axis=1) + result = df.quantile(0.5, axis=1, interpolation=interpolation, method=method) expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) + if interpolation == "nearest": + expected = expected.astype(np.int64) tm.assert_series_equal(result, expected) - result = df.quantile([0.5, 0.75], axis=1) + result = df.quantile( + [0.5, 0.75], axis=1, interpolation=interpolation, method=method + ) expected = DataFrame( {1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75] ) + if interpolation == "nearest": + expected.iloc[0, :] -= 0.5 + expected.iloc[1, :] += 0.25 + expected = expected.astype(np.int64) tm.assert_frame_equal(result, expected, check_index_type=True) + def test_axis_numeric_only_true(self, interp_method): # We may want to break API in the future to change this # so that we exclude non-numeric along the same axis # See GH #7312 + interpolation, method = interp_method df = DataFrame([[1, 2, 3], ["a", "b", 4]]) - result = df.quantile(0.5, axis=1, numeric_only=True) + result = df.quantile( + 0.5, axis=1, numeric_only=True, interpolation=interpolation, method=method + ) expected = Series([3.0, 4.0], index=[0, 1], name=0.5) + if interpolation == "nearest": + expected = expected.astype(np.int64) tm.assert_series_equal(result, expected) - def test_quantile_date_range(self): + def test_quantile_date_range(self, interp_method): # GH 2460 - + interpolation, method = interp_method dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") ser = Series(dti) df = 
DataFrame(ser) - result = df.quantile(numeric_only=False) + result = df.quantile( + numeric_only=False, interpolation=interpolation, method=method + ) expected = Series( ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]" ) tm.assert_series_equal(result, expected) - def test_quantile_axis_mixed(self): + def test_quantile_axis_mixed(self, interp_method): # mixed on axis=1 + interpolation, method = interp_method df = DataFrame( { "A": [1, 2, 3], @@ -129,8 +172,12 @@ def test_quantile_axis_mixed(self): "D": ["foo", "bar", "baz"], } ) - result = df.quantile(0.5, axis=1, numeric_only=True) + result = df.quantile( + 0.5, axis=1, numeric_only=True, interpolation=interpolation, method=method + ) expected = Series([1.5, 2.5, 3.5], name=0.5) + if interpolation == "nearest": + expected -= 0.5 tm.assert_series_equal(result, expected) # must raise @@ -138,30 +185,40 @@ def test_quantile_axis_mixed(self): with pytest.raises(TypeError, match=msg): df.quantile(0.5, axis=1, numeric_only=False) - def test_quantile_axis_parameter(self): + def test_quantile_axis_parameter(self, interp_method): # GH 9543/9544 - + interpolation, method = interp_method df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantile(0.5, axis=0) + result = df.quantile(0.5, axis=0, interpolation=interpolation, method=method) expected = Series([2.0, 3.0], index=["A", "B"], name=0.5) + if interpolation == "nearest": + expected = expected.astype(np.int64) tm.assert_series_equal(result, expected) - expected = df.quantile(0.5, axis="index") + expected = df.quantile( + 0.5, axis="index", interpolation=interpolation, method=method + ) + if interpolation == "nearest": + expected = expected.astype(np.int64) tm.assert_series_equal(result, expected) - result = df.quantile(0.5, axis=1) + result = df.quantile(0.5, axis=1, interpolation=interpolation, method=method) expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) + if interpolation == "nearest": + expected = 
expected.astype(np.int64) tm.assert_series_equal(result, expected) - result = df.quantile(0.5, axis="columns") + result = df.quantile( + 0.5, axis="columns", interpolation=interpolation, method=method + ) tm.assert_series_equal(result, expected) msg = "No axis named -1 for object type DataFrame" with pytest.raises(ValueError, match=msg): - df.quantile(0.1, axis=-1) + df.quantile(0.1, axis=-1, interpolation=interpolation, method=method) msg = "No axis named column for object type DataFrame" with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis="column") @@ -244,25 +301,37 @@ def test_quantile_interpolation_int(self, int_frame): assert q1["A"] == np.percentile(df["A"], 10) tm.assert_series_equal(q, q1) - def test_quantile_multi(self): + def test_quantile_multi(self, interp_method): + interpolation, method = interp_method df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) - result = df.quantile([0.25, 0.5]) + result = df.quantile([0.25, 0.5], interpolation=interpolation, method=method) expected = DataFrame( [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], index=[0.25, 0.5], columns=["a", "b", "c"], ) + if interpolation == "nearest": + expected = expected.astype(np.int64) tm.assert_frame_equal(result, expected) - # axis = 1 - result = df.quantile([0.25, 0.5], axis=1) + def test_quantile_multi_axis_1(self, interp_method): + interpolation, method = interp_method + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) + result = df.quantile( + [0.25, 0.5], axis=1, interpolation=interpolation, method=method + ) expected = DataFrame( [[1.0, 2.0, 3.0]] * 2, index=[0.25, 0.5], columns=[0, 1, 2] ) + if interpolation == "nearest": + expected = expected.astype(np.int64) tm.assert_frame_equal(result, expected) - # empty - result = DataFrame({"x": [], "y": []}).quantile([0.1, 0.9], axis=0) + def test_quantile_multi_empty(self, interp_method): + interpolation, method = interp_method + result = DataFrame({"x": [], "y": []}).quantile( + 
[0.1, 0.9], axis=0, interpolation=interpolation, method=method + ) expected = DataFrame( {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9] ) @@ -326,26 +395,37 @@ def test_quantile_datetime(self): "Period[D]", ], ) - def test_quantile_dt64_empty(self, dtype): + def test_quantile_dt64_empty(self, dtype, interp_method): # GH#41544 + interpolation, method = interp_method df = DataFrame(columns=["a", "b"], dtype=dtype) - res = df.quantile(0.5, axis=1, numeric_only=False) + res = df.quantile( + 0.5, axis=1, numeric_only=False, interpolation=interpolation, method=method + ) expected = Series([], index=[], name=0.5, dtype=dtype) tm.assert_series_equal(res, expected) # no columns in result, so no dtype preservation - res = df.quantile([0.5], axis=1, numeric_only=False) + res = df.quantile( + [0.5], + axis=1, + numeric_only=False, + interpolation=interpolation, + method=method, + ) expected = DataFrame(index=[0.5]) tm.assert_frame_equal(res, expected) - def test_quantile_invalid(self, datetime_frame): + @pytest.mark.parametrize("invalid", [-1, 2, [0.5, -1], [0.5, 2]]) + def test_quantile_invalid(self, invalid, datetime_frame, interp_method): msg = "percentiles should all be in the interval \\[0, 1\\]" - for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: - with pytest.raises(ValueError, match=msg): - datetime_frame.quantile(invalid) + interpolation, method = interp_method + with pytest.raises(ValueError, match=msg): + datetime_frame.quantile(invalid, interpolation=interpolation, method=method) - def test_quantile_box(self): + def test_quantile_box(self, interp_method): + interpolation, method = interp_method df = DataFrame( { "A": [ @@ -366,7 +446,9 @@ def test_quantile_box(self): } ) - res = df.quantile(0.5, numeric_only=False) + res = df.quantile( + 0.5, numeric_only=False, interpolation=interpolation, method=method + ) exp = Series( [ @@ -379,7 +461,9 @@ def test_quantile_box(self): ) tm.assert_series_equal(res, exp) - res = df.quantile([0.5], numeric_only=False) 
+ res = df.quantile( + [0.5], numeric_only=False, interpolation=interpolation, method=method + ) exp = DataFrame( [ [ @@ -393,6 +477,7 @@ def test_quantile_box(self): ) tm.assert_frame_equal(res, exp) + def test_quantile_box_nat(self): # DatetimeLikeBlock may be consolidated and contain NaT in different loc df = DataFrame( { @@ -468,49 +553,67 @@ def test_quantile_box(self): ) tm.assert_frame_equal(res, exp) - def test_quantile_nan(self): + def test_quantile_nan(self, interp_method): + interpolation, method = interp_method # GH 14357 - float block where some cols have missing values df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)}) df.iloc[-1, 1] = np.nan - res = df.quantile(0.5) - exp = Series([3.0, 2.5], index=["a", "b"], name=0.5) + res = df.quantile(0.5, interpolation=interpolation, method=method) + exp = Series( + [3.0, 2.5 if interpolation == "linear" else 3.0], index=["a", "b"], name=0.5 + ) tm.assert_series_equal(res, exp) - res = df.quantile([0.5, 0.75]) - exp = DataFrame({"a": [3.0, 4.0], "b": [2.5, 3.25]}, index=[0.5, 0.75]) + res = df.quantile([0.5, 0.75], interpolation=interpolation, method=method) + exp = DataFrame( + { + "a": [3.0, 4.0], + "b": [2.5, 3.25] if interpolation == "linear" else [3.0, 4.0], + }, + index=[0.5, 0.75], + ) tm.assert_frame_equal(res, exp) - res = df.quantile(0.5, axis=1) + res = df.quantile(0.5, axis=1, interpolation=interpolation, method=method) exp = Series(np.arange(1.0, 6.0), name=0.5) tm.assert_series_equal(res, exp) - res = df.quantile([0.5, 0.75], axis=1) + res = df.quantile( + [0.5, 0.75], axis=1, interpolation=interpolation, method=method + ) exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75]) + if interpolation == "nearest": + exp.iloc[1, -1] = np.nan tm.assert_frame_equal(res, exp) # full-nan column df["b"] = np.nan - res = df.quantile(0.5) + res = df.quantile(0.5, interpolation=interpolation, method=method) exp = Series([3.0, np.nan], index=["a", "b"], name=0.5) tm.assert_series_equal(res, 
exp) - res = df.quantile([0.5, 0.75]) + res = df.quantile([0.5, 0.75], interpolation=interpolation, method=method) exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75]) tm.assert_frame_equal(res, exp) - def test_quantile_nat(self): + def test_quantile_nat(self, interp_method): + interpolation, method = interp_method # full NaT column df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}) - res = df.quantile(0.5, numeric_only=False) + res = df.quantile( + 0.5, numeric_only=False, interpolation=interpolation, method=method + ) exp = Series([pd.NaT], index=["a"], name=0.5) tm.assert_series_equal(res, exp) - res = df.quantile([0.5], numeric_only=False) + res = df.quantile( + [0.5], numeric_only=False, interpolation=interpolation, method=method + ) exp = DataFrame({"a": [pd.NaT]}, index=[0.5]) tm.assert_frame_equal(res, exp) @@ -526,50 +629,57 @@ def test_quantile_nat(self): } ) - res = df.quantile(0.5, numeric_only=False) + res = df.quantile( + 0.5, numeric_only=False, interpolation=interpolation, method=method + ) exp = Series([Timestamp("2012-01-02"), pd.NaT], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) - res = df.quantile([0.5], numeric_only=False) + res = df.quantile( + [0.5], numeric_only=False, interpolation=interpolation, method=method + ) exp = DataFrame( [[Timestamp("2012-01-02"), pd.NaT]], index=[0.5], columns=["a", "b"] ) tm.assert_frame_equal(res, exp) - def test_quantile_empty_no_rows_floats(self): + def test_quantile_empty_no_rows_floats(self, interp_method): + interpolation, method = interp_method - # floats df = DataFrame(columns=["a", "b"], dtype="float64") - res = df.quantile(0.5) + res = df.quantile(0.5, interpolation=interpolation, method=method) exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) - res = df.quantile([0.5]) + res = df.quantile([0.5], interpolation=interpolation, method=method) exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5]) 
tm.assert_frame_equal(res, exp) - res = df.quantile(0.5, axis=1) + res = df.quantile(0.5, axis=1, interpolation=interpolation, method=method) exp = Series([], index=[], dtype="float64", name=0.5) tm.assert_series_equal(res, exp) - res = df.quantile([0.5], axis=1) + res = df.quantile([0.5], axis=1, interpolation=interpolation, method=method) exp = DataFrame(columns=[], index=[0.5]) tm.assert_frame_equal(res, exp) - def test_quantile_empty_no_rows_ints(self): - # ints + def test_quantile_empty_no_rows_ints(self, interp_method): + interpolation, method = interp_method df = DataFrame(columns=["a", "b"], dtype="int64") - res = df.quantile(0.5) + res = df.quantile(0.5, interpolation=interpolation, method=method) exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) - def test_quantile_empty_no_rows_dt64(self): + def test_quantile_empty_no_rows_dt64(self, interp_method): + interpolation, method = interp_method # datetimes df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]") - res = df.quantile(0.5, numeric_only=False) + res = df.quantile( + 0.5, numeric_only=False, interpolation=interpolation, method=method + ) exp = Series( [pd.NaT, pd.NaT], index=["a", "b"], dtype="datetime64[ns]", name=0.5 ) @@ -577,627 +687,49 @@ def test_quantile_empty_no_rows_dt64(self): # Mixed dt64/dt64tz df["a"] = df["a"].dt.tz_localize("US/Central") - res = df.quantile(0.5, numeric_only=False) + res = df.quantile( + 0.5, numeric_only=False, interpolation=interpolation, method=method + ) exp = exp.astype(object) tm.assert_series_equal(res, exp) # both dt64tz df["b"] = df["b"].dt.tz_localize("US/Central") - res = df.quantile(0.5, numeric_only=False) + res = df.quantile( + 0.5, numeric_only=False, interpolation=interpolation, method=method + ) exp = exp.astype(df["b"].dtype) tm.assert_series_equal(res, exp) - def test_quantile_empty_no_columns(self): + def test_quantile_empty_no_columns(self, interp_method): # GH#23925 _get_numeric_data may drop all 
columns + interpolation, method = interp_method df = DataFrame(pd.date_range("1/1/18", periods=5)) df.columns.name = "captain tightpants" - result = df.quantile(0.5, numeric_only=True) - expected = Series([], index=[], name=0.5, dtype=np.float64) - expected.index.name = "captain tightpants" - tm.assert_series_equal(result, expected) - - result = df.quantile([0.5], numeric_only=True) - expected = DataFrame([], index=[0.5], columns=[]) - expected.columns.name = "captain tightpants" - tm.assert_frame_equal(result, expected) - - def test_quantile_item_cache(self, using_array_manager): - # previous behavior incorrect retained an invalid _item_cache entry - df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) - df["D"] = df["A"] * 2 - ser = df["A"] - if not using_array_manager: - assert len(df._mgr.blocks) == 2 - - df.quantile(numeric_only=False) - ser.values[0] = 99 - - assert df.iloc[0, 0] == df["A"][0] - - -class TestDataFrameMultiQuantile: - @pytest.mark.parametrize( - "df,expected", - [ - [ - DataFrame( - { - 0: Series(pd.arrays.SparseArray([1, 2, 3])), - 1: Series(pd.arrays.SparseArray([4, 5, 6])), - } - ), - Series([2, 5], dtype="Sparse[int]", name=0.5), - ], - [ - DataFrame(Series([0.0, None, 1.0, 2.0], dtype="Sparse[float]")), - Series([2.0], dtype="Sparse[float]", name=0.5), - ], - ], - ) - def test_quantile_sparse(self, df, expected): - # GH#17198 - # GH#24600 - result = df.quantile(interpolation="nearest", method="table") - - tm.assert_series_equal(result, expected) - - def test_quantile(self, datetime_frame): - from numpy import percentile - - df = datetime_frame - q = df.quantile(0.1, axis=0, interpolation="nearest", method="table") - assert ( - q["A"] - == percentile(df.to_records(index=False), 10, interpolation="nearest")[0] - ) - tm.assert_index_equal(q.index, df.columns) - - q = df.quantile(0.9, axis=1, interpolation="nearest", method="table") - assert ( - q["2000-01-03"] - == percentile(df.T.to_records(index=False), 90, 
interpolation="nearest")[0] - ) - tm.assert_index_equal(q.index, df.index) - - # test degenerate case - q = DataFrame({"x": [], "y": []}).quantile( - 0.1, axis=0, interpolation="nearest", method="table" - ) - assert np.isnan(q["x"]) and np.isnan(q["y"]) - - # non-numeric exclusion - df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]}) - result = df.quantile(0.5, interpolation="nearest", method="table") - expected = Series([3], index=["col2"], name=0.5) - tm.assert_series_equal(result, expected) - - # axis - df = DataFrame( - {"A": [1, 2, 3], "B": [2, 3, 4], "C": [4, 5, 6]}, index=[1, 2, 3] - ) - result = df.quantile(0.5, axis=1, interpolation="nearest", method="table") - expected = Series([2, 3, 4], index=[1, 2, 3], name=0.5) - tm.assert_series_equal(result, expected) - - result = df.quantile( - [0.5, 0.75], axis=1, interpolation="nearest", method="table" - ) - expected = DataFrame({1: [2, 4], 2: [3, 5], 3: [4, 6]}, index=[0.5, 0.75]) - tm.assert_frame_equal(result, expected, check_index_type=True) - - # We may want to break API in the future to change this - # so that we exclude non-numeric along the same axis - # See GH #7312 - df = DataFrame([[1, 2, 3], ["a", "b", 4]]) - result = df.quantile(0.5, axis=1, interpolation="nearest", method="table") - expected = Series([3, 4], index=[0, 1], name=0.5) - tm.assert_series_equal(result, expected) - - def test_quantile_date_range(self): - # GH 2460 - - dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") - ser = Series(dti) - df = DataFrame(ser) - - result = df.quantile( - numeric_only=False, interpolation="nearest", method="table" - ) - expected = Series(["2016-01-02"], name=0.5, dtype="datetime64[ns, US/Pacific]") - - tm.assert_series_equal(result, expected) - - def test_quantile_axis_mixed(self): - - # mixed on axis=1 - df = DataFrame( - { - "A": [1, 2, 3], - "B": [2.0, 3.0, 4.0], - "C": pd.date_range("20130101", periods=3), - "D": ["foo", "bar", "baz"], - } - ) - result = df.quantile(0.5, 
axis=1, interpolation="nearest", method="table") - expected = Series([1.0, 2.0, 3.0], name=0.5) - tm.assert_series_equal(result, expected) - - # must raise - msg = "'values' is not ordered" - with pytest.raises(TypeError, match=msg): - df.quantile( - 0.5, axis=1, numeric_only=False, interpolation="nearest", method="table" - ) - - def test_quantile_axis_parameter(self): - # GH 9543/9544 - - df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - - result = df.quantile(0.5, axis=0, interpolation="nearest", method="table") - - expected = Series([2, 3], index=["A", "B"], name=0.5) - tm.assert_series_equal(result, expected) - - expected = df.quantile( - 0.5, axis="index", interpolation="nearest", method="table" - ) - tm.assert_series_equal(result, expected) - - result = df.quantile(0.5, axis=1, interpolation="nearest", method="table") - - expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5) - tm.assert_series_equal(result, expected) - result = df.quantile( - 0.5, axis="columns", interpolation="nearest", method="table" - ) - tm.assert_series_equal(result, expected) - - msg = "No axis named -1 for object type DataFrame" - with pytest.raises(ValueError, match=msg): - df.quantile(0.1, axis=-1, interpolation="nearest", method="table") - msg = "No axis named column for object type DataFrame" - with pytest.raises(ValueError, match=msg): - df.quantile(0.1, axis="column", interpolation="nearest", method="table") - - def test_quantile_interpolation(self): - # see gh-10174 - - # interpolation method other than default nearest - df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantile(0.5, axis=1, interpolation="higher", method="table") - expected = Series([2, 3, 4], index=[1, 2, 3], name=0.5) - tm.assert_series_equal(result, expected) - - # cross-check interpolation=higher results in original dtype - exp = np.percentile( - np.array([[1, 2, 3], [2, 3, 4]]), 0.5, axis=0, interpolation="higher" - ) - expected = Series(exp, index=[1, 2, 
3], name=0.5, dtype="int64") - tm.assert_series_equal(result, expected) - - # float - df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3]) - result = df.quantile(0.5, axis=1, interpolation="higher", method="table") - expected = Series([2.0, 3.0, 4.0], index=[1, 2, 3], name=0.5) - tm.assert_series_equal(result, expected) - exp = np.percentile( - np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]), - 0.5, - axis=0, - interpolation="higher", + 0.5, numeric_only=True, interpolation=interpolation, method=method ) - expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="float64") + expected = Series([], index=[], name=0.5, dtype=np.float64) + expected.index.name = "captain tightpants" tm.assert_series_equal(result, expected) - # axis - result = df.quantile([0.5, 0.75], axis=1, interpolation="lower", method="table") - expected = DataFrame( - {1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75] - ) - tm.assert_frame_equal(result, expected) - - # test degenerate case - df = DataFrame({"x": [], "y": []}) - q = df.quantile(0.1, axis=0, interpolation="higher", method="table") - assert np.isnan(q["x"]) and np.isnan(q["y"]) - - # multi - df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) - result = df.quantile([0.25, 0.5], interpolation="lower", method="table") - - # https://github.com/numpy/numpy/issues/7163 - expected = DataFrame( - [[1, 1, 1], [2, 2, 2]], - index=[0.25, 0.5], - columns=["a", "b", "c"], - ) - tm.assert_frame_equal(result, expected) - - def test_quantile_interpolation_datetime(self, datetime_frame): - # see gh-10174 - - # interpolation = nearest (default case) - df = datetime_frame - q = df.quantile(0.1, axis=0, interpolation="nearest", method="table") - assert ( - q["A"] - == np.percentile(df.to_records(index=False), 10, interpolation="nearest")[0] - ) - - def test_quantile_interpolation_int(self, int_frame): - # see gh-10174 - - df = int_frame - # interpolation = nearest (default case) - q = 
df.quantile(0.1, interpolation="nearest", method="table") - assert ( - q["A"] - == np.percentile(df.to_records(index=False), 10, interpolation="nearest")[0] - ) - - # test with and without interpolation keyword - q1 = df.quantile(0.1, axis=0, interpolation="nearest", method="table") - assert ( - q1["A"] - == np.percentile(df.to_records(index=False), 10, interpolation="nearest")[0] - ) - tm.assert_series_equal(q, q1) - - def test_quantile_multi(self): - df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) - result = df.quantile([0.25, 0.5], interpolation="nearest", method="table") - expected = DataFrame( - [[1, 1, 1], [2, 2, 2]], - index=[0.25, 0.5], - columns=["a", "b", "c"], - ) - tm.assert_frame_equal(result, expected) - - # axis = 1 result = df.quantile( - [0.25, 0.5], axis=1, interpolation="nearest", method="table" + [0.5], numeric_only=True, interpolation=interpolation, method=method ) - expected = DataFrame( - [[1, 2, 3], [1, 2, 3]], index=[0.25, 0.5], columns=[0, 1, 2] - ) - tm.assert_frame_equal(result, expected) - - # empty - result = DataFrame({"x": [], "y": []}).quantile( - [0.1, 0.9], axis=0, interpolation="nearest", method="table" - ) - expected = DataFrame( - {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9] - ) - tm.assert_frame_equal(result, expected) - - def test_quantile_datetime(self): - df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]}) - - # exclude datetime - result = df.quantile(0.5, interpolation="nearest", method="table") - expected = Series([0], index=["b"], name=0.5) - tm.assert_series_equal(result, expected) - - # datetime - result = df.quantile( - 0.5, numeric_only=False, interpolation="nearest", method="table" - ) - expected = Series([Timestamp("2010-01-01"), 0], index=["a", "b"], name=0.5) - tm.assert_series_equal(result, expected) - - # datetime w/ multi - result = df.quantile( - [0.5], numeric_only=False, interpolation="nearest", method="table" - ) - expected = DataFrame( - 
[[Timestamp("2010-01-01"), 0]], index=[0.5], columns=["a", "b"] - ) - tm.assert_frame_equal(result, expected) - - # axis = 1 - df["c"] = pd.to_datetime(["2011", "2012"]) - result = df[["a", "c"]].quantile( - 0.5, axis=1, numeric_only=False, interpolation="nearest", method="table" - ) - expected = Series( - [Timestamp("2010-01-01"), Timestamp("2011-01-01")], - index=[0, 1], - name=0.5, - ) - tm.assert_series_equal(result, expected) - - result = df[["a", "c"]].quantile( - [0.5], axis=1, numeric_only=False, interpolation="nearest", method="table" - ) - expected = DataFrame( - [[Timestamp("2010-01-01"), Timestamp("2011-01-01")]], - index=[0.5], - columns=[0, 1], - ) - tm.assert_frame_equal(result, expected) - - # empty when numeric_only=True - result = df[["a", "c"]].quantile(0.5, interpolation="nearest", method="table") - expected = Series([], index=[], name=0.5) - tm.assert_series_equal(result, expected) - - result = df[["a", "c"]].quantile([0.5], interpolation="nearest", method="table") - expected = DataFrame([], index=[0.5]) - tm.assert_frame_equal(result, expected) - - def test_quantile_invalid(self, datetime_frame): - msg = "percentiles should all be in the interval \\[0, 1\\]" - for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: - with pytest.raises(ValueError, match=msg): - datetime_frame.quantile( - invalid, interpolation="nearest", method="table" - ) - - def test_quantile_box(self): - df = DataFrame( - { - "A": [ - Timestamp("2011-01-01"), - Timestamp("2011-01-02"), - Timestamp("2011-01-03"), - ], - "B": [ - Timestamp("2011-01-01", tz="US/Eastern"), - Timestamp("2011-01-02", tz="US/Eastern"), - Timestamp("2011-01-03", tz="US/Eastern"), - ], - "C": [ - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Timedelta("3 days"), - ], - } - ) - - res = df.quantile( - 0.5, numeric_only=False, interpolation="nearest", method="table" - ) - - exp = Series( - [ - Timestamp("2011-01-02"), - Timestamp("2011-01-02", tz="US/Eastern"), - pd.Timedelta("2 days"), - ], - name=0.5, 
- index=["A", "B", "C"], - ) - tm.assert_series_equal(res, exp) - - res = df.quantile( - [0.5], numeric_only=False, interpolation="nearest", method="table" - ) - exp = DataFrame( - [ - [ - Timestamp("2011-01-02"), - Timestamp("2011-01-02", tz="US/Eastern"), - pd.Timedelta("2 days"), - ] - ], - index=[0.5], - columns=["A", "B", "C"], - ) - tm.assert_frame_equal(res, exp) - - # DatetimeLikeBlock may be consolidated and contain NaT in different loc - df = DataFrame( - { - "A": [ - Timestamp("2011-01-01"), - pd.NaT, - Timestamp("2011-01-02"), - Timestamp("2011-01-03"), - ], - "a": [ - Timestamp("2011-01-01"), - Timestamp("2011-01-02"), - pd.NaT, - Timestamp("2011-01-03"), - ], - "B": [ - Timestamp("2011-01-01", tz="US/Eastern"), - pd.NaT, - Timestamp("2011-01-02", tz="US/Eastern"), - Timestamp("2011-01-03", tz="US/Eastern"), - ], - "b": [ - Timestamp("2011-01-01", tz="US/Eastern"), - Timestamp("2011-01-02", tz="US/Eastern"), - pd.NaT, - Timestamp("2011-01-03", tz="US/Eastern"), - ], - "C": [ - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Timedelta("3 days"), - pd.NaT, - ], - "c": [ - pd.NaT, - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Timedelta("3 days"), - ], - }, - columns=list("AaBbCc"), - ) - - res = df.quantile( - 0.5, numeric_only=False, interpolation="nearest", method="table" - ) - exp = Series( - [ - Timestamp("2011-01-03"), - Timestamp("2011-01-03"), - Timestamp("2011-01-03", tz="US/Eastern"), - Timestamp("2011-01-03", tz="US/Eastern"), - pd.NaT, - pd.Timedelta("3 days"), - ], - name=0.5, - index=list("AaBbCc"), - ) - tm.assert_series_equal(res, exp) - - res = df.quantile( - [0.5], numeric_only=False, interpolation="nearest", method="table" - ) - exp = DataFrame( - [ - [ - Timestamp("2011-01-03"), - Timestamp("2011-01-03"), - Timestamp("2011-01-03", tz="US/Eastern"), - Timestamp("2011-01-03", tz="US/Eastern"), - np.timedelta64("NaT", "ns"), - pd.Timedelta("3 days"), - ] - ], - index=[0.5], - columns=list("AaBbCc"), - ) - 
tm.assert_frame_equal(res, exp) - - def test_quantile_nan(self): - - # GH 14357 - float block where some cols have missing values - df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)}) - df.iloc[-1, 1] = np.nan - - res = df.quantile(0.5, interpolation="nearest", method="table") - exp = Series([3.0, 3.0], index=["a", "b"], name=0.5) - tm.assert_series_equal(res, exp) - - res = df.quantile([0.5, 0.75], interpolation="nearest", method="table") - exp = DataFrame({"a": [3.0, 4.0], "b": [3.0, 4.0]}, index=[0.5, 0.75]) - tm.assert_frame_equal(res, exp) - - res = df.quantile(0.5, axis=1, interpolation="nearest", method="table") - exp = Series(np.arange(1.0, 6.0), name=0.5) - tm.assert_series_equal(res, exp) - - res = df.quantile([0.5, 0.75], axis=1, interpolation="nearest", method="table") - exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75]) - exp.iloc[1, -1] = np.nan - tm.assert_frame_equal(res, exp) - - # full-nan column - df["b"] = np.nan - - res = df.quantile(0.5, interpolation="nearest", method="table") - exp = Series([3.0, np.nan], index=["a", "b"], name=0.5) - tm.assert_series_equal(res, exp) - - res = df.quantile([0.5, 0.75], interpolation="nearest", method="table") - exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75]) - tm.assert_frame_equal(res, exp) - - def test_quantile_nat(self): - - # full NaT column - df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}) - - res = df.quantile( - 0.5, numeric_only=False, interpolation="nearest", method="table" - ) - exp = Series([pd.NaT], index=["a"], name=0.5) - tm.assert_series_equal(res, exp) - - res = df.quantile( - [0.5], numeric_only=False, interpolation="nearest", method="table" - ) - exp = DataFrame({"a": [pd.NaT]}, index=[0.5]) - tm.assert_frame_equal(res, exp) - - # mixed non-null / full null column - df = DataFrame( - { - "a": [ - Timestamp("2012-01-01"), - Timestamp("2012-01-02"), - Timestamp("2012-01-03"), - ], - "b": [pd.NaT, pd.NaT, pd.NaT], - } - ) - - res = 
df.quantile( - 0.5, numeric_only=False, interpolation="nearest", method="table" - ) - exp = Series([Timestamp("2012-01-02"), pd.NaT], index=["a", "b"], name=0.5) - tm.assert_series_equal(res, exp) - - res = df.quantile( - [0.5], numeric_only=False, interpolation="nearest", method="table" - ) - exp = DataFrame( - [[Timestamp("2012-01-02"), pd.NaT]], index=[0.5], columns=["a", "b"] - ) - tm.assert_frame_equal(res, exp) - - def test_quantile_empty_no_rows(self): - - # floats - df = DataFrame(columns=["a", "b"], dtype="float64") - - res = df.quantile(0.5, interpolation="nearest", method="table") - exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5) - tm.assert_series_equal(res, exp) - - res = df.quantile([0.5], interpolation="nearest", method="table") - exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5]) - tm.assert_frame_equal(res, exp) - - # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) - # res = df.quantile(0.5, axis=1, interpolation="nearest", method="table") - # res = df.quantile([0.5], axis=1, interpolation="nearest", method="table") - - # ints - df = DataFrame(columns=["a", "b"], dtype="int64") - - # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) - # res = df.quantile(0.5, interpolation="nearest", method="table") - - # datetimes - df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]") - - # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0) - # res = df.quantile( - # 0.5, numeric_only=False, interpolation="nearest", method="table" - # ) - - def test_quantile_empty_no_columns(self): - # GH#23925 _get_numeric_data may drop all columns - df = DataFrame(pd.date_range("1/1/18", periods=5)) - df.columns.name = "captain tightpants" - result = df.quantile(0.5, interpolation="nearest", method="table") - expected = Series([], index=[], name=0.5, dtype=np.float64) - expected.index.name = "captain tightpants" - tm.assert_series_equal(result, expected) - - result = df.quantile([0.5], interpolation="nearest", method="table") 
expected = DataFrame([], index=[0.5], columns=[]) expected.columns.name = "captain tightpants" tm.assert_frame_equal(result, expected) - def test_quantile_item_cache(self, using_array_manager): + def test_quantile_item_cache(self, using_array_manager, interp_method): # previous behavior incorrect retained an invalid _item_cache entry + interpolation, method = interp_method df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) df["D"] = df["A"] * 2 ser = df["A"] if not using_array_manager: assert len(df._mgr.blocks) == 2 - df.quantile(numeric_only=False, interpolation="nearest", method="table") + df.quantile(numeric_only=False, interpolation=interpolation, method=method) ser.values[0] = 99 assert df.iloc[0, 0] == df["A"][0] From 6fd8d49ac5fe90a499a32996dac4e60d413795ec Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 29 Jul 2022 16:19:11 -0700 Subject: [PATCH 16/22] Add tests validating arguments, remove unnecessary tolist() --- pandas/core/frame.py | 2 +- pandas/tests/frame/methods/test_quantile.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4bb10ce9031ed..74cd9167b09dd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11312,7 +11312,7 @@ def quantile( np.arange(len(data)), q, **{np_percentile_argname: interpolation} ) - by = data.columns.tolist() + by = data.columns if len(by) > 1: keys = [data._get_label_or_level_values(x) for x in by] indexer = lexsort_indexer(keys) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 910c658b19806..82838ae5b6c2b 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -734,6 +734,14 @@ def test_quantile_item_cache(self, using_array_manager, interp_method): assert df.iloc[0, 0] == df["A"][0] + def test_invalid_method(self): + with pytest.raises(ValueError, match="Invalid method: foo"): + 
DataFrame(range(1)).quantile(0.5, method="foo") + + def test_table_invalid_interpolation(self): + with pytest.raises(ValueError, match="Invalid interpolation: foo"): + DataFrame(range(1)).quantile(0.5, method="table", interpolation="foo") + class TestQuantileExtensionDtype: # TODO: tests for axis=1? From 4ebab82c173d56c0a80df6623ebfd5b4490242ef Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 29 Jul 2022 16:24:57 -0700 Subject: [PATCH 17/22] Add whatsnew note --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index d138ebb9c02a3..3320a8fb3c59d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -279,6 +279,7 @@ Other enhancements - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) - :meth:`DataFrame.compare` now accepts an argument ``result_names`` to allow the user to specify the result's names of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`) +- :meth:`DataFrame.quantile` gained a ``method`` argument that can accept ``table`` to evaluate multi-column quantiles (:issue:`43881`) .. --------------------------------------------------------------------------- .. 
_whatsnew_150.notable_bug_fixes: From eae90bc5325930dce361ec8d7b85e588edd25875 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 1 Aug 2022 16:18:32 -0700 Subject: [PATCH 18/22] Add xfails for arraymanager --- pandas/tests/frame/methods/test_quantile.py | 76 +++++++++++++++++---- 1 file changed, 62 insertions(+), 14 deletions(-) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 82838ae5b6c2b..e0cbd543f9809 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -30,7 +30,9 @@ class TestDataFrameQuantile: [DataFrame, Series, Timestamp], ], ) - def test_numeric_only_default_false_warning(self, non_num_col, interp_method): + def test_numeric_only_default_false_warning( + self, non_num_col, interp_method, request, using_array_manager + ): # GH #7308 interpolation, method = interp_method df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}) @@ -43,6 +45,10 @@ def test_numeric_only_default_false_warning(self, non_num_col, interp_method): ) if interpolation == "nearest": expected = expected.astype(np.int64) + if method == "table" and using_array_manager: + request.node.add_marker( + pytest.mark.xfail(reason="Axis name incorrectly set.") + ) with tm.assert_produces_warning(FutureWarning, match="numeric_only"): result = df.quantile(0.5, interpolation=interpolation, method=method) tm.assert_series_equal(result, expected) @@ -96,7 +102,7 @@ def test_empty(self, interp_method): ) assert np.isnan(q["x"]) and np.isnan(q["y"]) - def test_non_numeric_exclusion(self, interp_method): + def test_non_numeric_exclusion(self, interp_method, request, using_array_manager): interpolation, method = interp_method df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]}) rs = df.quantile( @@ -106,9 +112,13 @@ def test_non_numeric_exclusion(self, interp_method): xp = df.median().rename(0.5) if interpolation == "nearest": xp = (xp + 0.5).astype(np.int64) + if method == 
"table" and using_array_manager: + request.node.add_marker( + pytest.mark.xfail(reason="Axis name incorrectly set.") + ) tm.assert_series_equal(rs, xp) - def test_axis(self, interp_method): + def test_axis(self, interp_method, request, using_array_manager): # axis interpolation, method = interp_method df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) @@ -116,6 +126,10 @@ def test_axis(self, interp_method): expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) if interpolation == "nearest": expected = expected.astype(np.int64) + if method == "table" and using_array_manager: + request.node.add_marker( + pytest.mark.xfail(reason="Axis name incorrectly set.") + ) tm.assert_series_equal(result, expected) result = df.quantile( @@ -130,7 +144,7 @@ def test_axis(self, interp_method): expected = expected.astype(np.int64) tm.assert_frame_equal(result, expected, check_index_type=True) - def test_axis_numeric_only_true(self, interp_method): + def test_axis_numeric_only_true(self, interp_method, request, using_array_manager): # We may want to break API in the future to change this # so that we exclude non-numeric along the same axis # See GH #7312 @@ -142,9 +156,13 @@ def test_axis_numeric_only_true(self, interp_method): expected = Series([3.0, 4.0], index=[0, 1], name=0.5) if interpolation == "nearest": expected = expected.astype(np.int64) + if method == "table" and using_array_manager: + request.node.add_marker( + pytest.mark.xfail(reason="Axis name incorrectly set.") + ) tm.assert_series_equal(result, expected) - def test_quantile_date_range(self, interp_method): + def test_quantile_date_range(self, interp_method, request, using_array_manager): # GH 2460 interpolation, method = interp_method dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") @@ -157,10 +175,14 @@ def test_quantile_date_range(self, interp_method): expected = Series( ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]" ) + if method == "table" and 
using_array_manager: + request.node.add_marker( + pytest.mark.xfail(reason="Axis name incorrectly set.") + ) tm.assert_series_equal(result, expected) - def test_quantile_axis_mixed(self, interp_method): + def test_quantile_axis_mixed(self, interp_method, request, using_array_manager): # mixed on axis=1 interpolation, method = interp_method @@ -178,6 +200,10 @@ def test_quantile_axis_mixed(self, interp_method): expected = Series([1.5, 2.5, 3.5], name=0.5) if interpolation == "nearest": expected -= 0.5 + if method == "table" and using_array_manager: + request.node.add_marker( + pytest.mark.xfail(reason="Axis name incorrectly set.") + ) tm.assert_series_equal(result, expected) # must raise @@ -185,9 +211,13 @@ def test_quantile_axis_mixed(self, interp_method): with pytest.raises(TypeError, match=msg): df.quantile(0.5, axis=1, numeric_only=False) - def test_quantile_axis_parameter(self, interp_method): + def test_quantile_axis_parameter(self, interp_method, request, using_array_manager): # GH 9543/9544 interpolation, method = interp_method + if method == "table" and using_array_manager: + request.node.add_marker( + pytest.mark.xfail(reason="Axis name incorrectly set.") + ) df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) result = df.quantile(0.5, axis=0, interpolation=interpolation, method=method) @@ -301,7 +331,7 @@ def test_quantile_interpolation_int(self, int_frame): assert q1["A"] == np.percentile(df["A"], 10) tm.assert_series_equal(q, q1) - def test_quantile_multi(self, interp_method): + def test_quantile_multi(self, interp_method, request, using_array_manager): interpolation, method = interp_method df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) result = df.quantile([0.25, 0.5], interpolation=interpolation, method=method) @@ -312,9 +342,13 @@ def test_quantile_multi(self, interp_method): ) if interpolation == "nearest": expected = expected.astype(np.int64) + if method == "table" and using_array_manager: + 
request.node.add_marker( + pytest.mark.xfail(reason="Axis name incorrectly set.") + ) tm.assert_frame_equal(result, expected) - def test_quantile_multi_axis_1(self, interp_method): + def test_quantile_multi_axis_1(self, interp_method, request, using_array_manager): interpolation, method = interp_method df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) result = df.quantile( @@ -325,6 +359,10 @@ def test_quantile_multi_axis_1(self, interp_method): ) if interpolation == "nearest": expected = expected.astype(np.int64) + if method == "table" and using_array_manager: + request.node.add_marker( + pytest.mark.xfail(reason="Axis name incorrectly set.") + ) tm.assert_frame_equal(result, expected) def test_quantile_multi_empty(self, interp_method): @@ -424,8 +462,12 @@ def test_quantile_invalid(self, invalid, datetime_frame, interp_method): with pytest.raises(ValueError, match=msg): datetime_frame.quantile(invalid, interpolation=interpolation, method=method) - def test_quantile_box(self, interp_method): + def test_quantile_box(self, interp_method, request, using_array_manager): interpolation, method = interp_method + if method == "table" and using_array_manager: + request.node.add_marker( + pytest.mark.xfail(reason="Axis name incorrectly set.") + ) df = DataFrame( { "A": [ @@ -553,9 +595,12 @@ def test_quantile_box_nat(self): ) tm.assert_frame_equal(res, exp) - def test_quantile_nan(self, interp_method): + def test_quantile_nan(self, interp_method, request, using_array_manager): interpolation, method = interp_method - + if method == "table" and using_array_manager: + request.node.add_marker( + pytest.mark.xfail(reason="Axis name incorrectly set.") + ) # GH 14357 - float block where some cols have missing values df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)}) df.iloc[-1, 1] = np.nan @@ -599,9 +644,12 @@ def test_quantile_nan(self, interp_method): exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75]) 
tm.assert_frame_equal(res, exp) - def test_quantile_nat(self, interp_method): + def test_quantile_nat(self, interp_method, request, using_array_manager): interpolation, method = interp_method - + if method == "table" and using_array_manager: + request.node.add_marker( + pytest.mark.xfail(reason="Axis name incorrectly set.") + ) # full NaT column df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}) From 250222b08611c15d50a923c8ead773ecbb3e6294 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 9 Aug 2022 12:41:29 -0700 Subject: [PATCH 19/22] Add ignores --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 48ab0f3c20eca..8468679f417aa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11308,7 +11308,7 @@ def quantile( dtype = self.dtype return self._constructor([], index=q, columns=data.columns, dtype=dtype) - q_idx = np.quantile( + q_idx = np.quantile( # type: ignore[call-overload] np.arange(len(data)), q, **{np_percentile_argname: interpolation} ) @@ -11318,7 +11318,7 @@ def quantile( indexer = lexsort_indexer(keys) else: by = by[0] - k = data._get_label_or_level_values(by) + k = data._get_label_or_level_values(by) # type: ignore[arg-type] indexer = nargsort(k) res = data._mgr.take(indexer[q_idx], verify=False) From a5977dc16b5619d965776e919156aeff8dc7e844 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 10 Aug 2022 15:00:43 -0700 Subject: [PATCH 20/22] Improve assertion of test_quantile --- pandas/tests/frame/methods/test_quantile.py | 32 +++++++++++++++------ 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index e2044ec85f0a5..88b06caa775cf 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -84,19 +84,35 @@ def test_quantile_sparse(self, df, expected): def test_quantile(self, 
datetime_frame, interp_method): interpolation, method = interp_method df = datetime_frame - q = df.quantile( + result = df.quantile( 0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method ) - if method == "single": - assert q["A"] == np.percentile(df["A"], 10) - tm.assert_index_equal(q.index, df.columns) + expected = Series( + [np.percentile(df[col], 10) for col in df.columns], + index=df.columns, + name=0.1, + ) + if interpolation == "linear": + # np.percentile values only comparable to linear interpolation + tm.assert_series_equal(result, expected) + else: + tm.assert_index_equal(result.index, expected.index) + assert result.name == expected.name - q = df.quantile( + result = df.quantile( 0.9, axis=1, numeric_only=True, interpolation=interpolation, method=method ) - if method == "single": - assert q["2000-01-17"] == np.percentile(df.loc["2000-01-17"], 90) - tm.assert_index_equal(q.index, df.index) + expected = Series( + [np.percentile(df.loc[date], 90) for date in df.index], + index=df.index, + name=0.9, + ) + if interpolation == "linear": + # np.percentile values only comparable to linear interpolation + tm.assert_series_equal(result, expected) + else: + tm.assert_index_equal(result.index, expected.index) + assert result.name == expected.name def test_empty(self, interp_method): interpolation, method = interp_method From 90de88e329a35143f876b2ea11ff02b8bdff46dd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 10 Aug 2022 17:52:29 -0700 Subject: [PATCH 21/22] Add xfail marker for arraymanager --- pandas/tests/frame/methods/test_quantile.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 88b06caa775cf..14b416011b956 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -81,7 +81,9 @@ def test_quantile_sparse(self, df, expected): 
tm.assert_series_equal(result, expected) - def test_quantile(self, datetime_frame, interp_method): + def test_quantile( + self, datetime_frame, interp_method, using_array_manager, request + ): interpolation, method = interp_method df = datetime_frame result = df.quantile( @@ -97,6 +99,11 @@ def test_quantile(self, datetime_frame, interp_method): tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result.index, expected.index) + request.node.add_marker( + pytest.mark.xfail( + using_array_manager, reason="Name set incorrectly for arraymanager" + ) + ) assert result.name == expected.name result = df.quantile( @@ -112,6 +119,11 @@ def test_quantile(self, datetime_frame, interp_method): tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result.index, expected.index) + request.node.add_marker( + pytest.mark.xfail( + using_array_manager, reason="Name set incorrectly for arraymanager" + ) + ) assert result.name == expected.name def test_empty(self, interp_method): From 016f81b35bffea981e0d9a830c9b1a308fdda666 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 15 Aug 2022 17:10:03 -0700 Subject: [PATCH 22/22] Fix typing again --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b54aad80f469c..74d4184fe985d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11231,8 +11231,8 @@ def quantile( # error: List item 0 has incompatible type "Union[float, Union[Union[ # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]"; # expected "float" - res_df = self.quantile( - [q], # type: ignore[list-item] + res_df = self.quantile( # type: ignore[call-overload] + [q], axis=axis, numeric_only=numeric_only, interpolation=interpolation,