From be2c9778dba88cf8ddc887b161dd886cc7aa5e5d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 8 May 2024 18:07:35 -0400 Subject: [PATCH 1/4] BUG: DataFrame constructor defaulting to float dtype on empty input --- pandas/core/construction.py | 4 ++-- pandas/core/frame.py | 14 ++++++------- pandas/core/groupby/generic.py | 2 +- pandas/core/internals/managers.py | 2 +- .../tests/arrays/categorical/test_missing.py | 4 ++-- pandas/tests/frame/methods/test_quantile.py | 7 +++++-- pandas/tests/frame/methods/test_reindex.py | 2 +- pandas/tests/frame/test_reductions.py | 2 +- pandas/tests/frame/test_stack_unstack.py | 15 +++++++++----- pandas/tests/groupby/methods/test_quantile.py | 2 +- pandas/tests/groupby/test_apply.py | 4 +--- pandas/tests/groupby/test_groupby.py | 4 ++-- pandas/tests/groupby/test_grouping.py | 10 +++------- pandas/tests/indexing/test_partial.py | 6 +++++- pandas/tests/resample/test_datetime_index.py | 2 +- pandas/tests/reshape/concat/test_concat.py | 2 +- pandas/tests/reshape/concat/test_empty.py | 2 +- pandas/tests/reshape/test_melt.py | 20 ++++++++++++++++--- pandas/tests/series/test_constructors.py | 2 +- pandas/tests/window/test_groupby.py | 6 +++--- pandas/tests/window/test_timeseries_window.py | 2 +- 21 files changed, 66 insertions(+), 48 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 2718e9819cdf8..2a21492bde2f4 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -652,8 +652,8 @@ def sanitize_array( data = list(data) if len(data) == 0 and dtype is None: - # We default to float64, matching numpy - subarr = np.array([], dtype=np.float64) + # We default to object, diverging from NumPy + subarr = np.array([], dtype=np.object_) elif dtype is not None: subarr = _try_cast(data, dtype, copy) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a4decab6e8a2b..c51c7f8a5e0fe 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -13059,16 +13059,14 @@ def quantile( interpolation=interpolation, method=method, ) - if method == "single": - res = res_df.iloc[0] - else: - # cannot directly iloc over sparse arrays - res = res_df.T.iloc[:, 0] + res = res_df.iloc[0] if axis == 1 and len(self) == 0: # GH#41544 try to get an appropriate dtype - dtype = find_common_type(list(self.dtypes)) - if needs_i8_conversion(dtype): - return res.astype(dtype) + dtype = "float64" + cdtype = find_common_type(list(self.dtypes)) + if needs_i8_conversion(cdtype): + dtype = cdtype + return res.astype(dtype) return res q = Index(q, dtype=np.float64) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0a048d11d0b4d..6703b3dcfcbbd 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -578,7 +578,7 @@ def _transform_general( concatenated = concat(results, ignore_index=True) result = self._set_result_index_ordered(concatenated) else: - result = self.obj._constructor(dtype=np.float64) + result = self.obj._constructor(dtype=self.obj.dtype) result.name = self.obj.name return result diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7c1bcbec1d3f2..0cc9d549d7566 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1778,7 +1778,7 @@ def as_array( passed_nan = lib.is_float(na_value) and isna(na_value) if len(self.blocks) == 0: - arr = np.empty(self.shape, dtype=float) + arr = np.empty(self.shape, dtype=object) return arr.transpose() if self.is_single_block: diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 9d4b78ce9944e..4765cbd8d3097 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -122,9 +122,9 @@ def test_compare_categorical_with_missing(self, a1, a2, categories): "na_value, dtype", [ (pd.NaT, "datetime64[ns]"), - (None, "float64"), + (None, "object"), (np.nan, "float64"), - (pd.NA, "float64"), + (pd.NA, "object"), ], ) def test_categorical_only_missing_values_no_cast(self, na_value, dtype): diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 32ae4c0ff2f50..842d2c3a416d5 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -81,7 +81,7 @@ def test_quantile(self, datetime_frame, interp_method, request): def test_empty(self, interp_method): interpolation, method = interp_method q = DataFrame({"x": [], "y": []}).quantile( - 0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method + 0.1, axis=0, interpolation=interpolation, method=method ) assert np.isnan(q["x"]) and np.isnan(q["y"]) @@ -319,8 +319,11 @@ def test_quantile_multi_empty(self, interp_method): result = DataFrame({"x": [], "y": []}).quantile( [0.1, 0.9], axis=0, interpolation=interpolation, method=method ) + dtype = "float64" if method == "single" else "object" expected = DataFrame( - {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9] + {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, + index=[0.1, 0.9], + dtype=dtype, ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 45109991c4553..3452e2796d16c 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -77,7 +77,7 @@ def test_setitem_reset_index_dtypes(self): df1["d"] = [] result = df1.reset_index() expected = DataFrame(columns=["a", "b", "c", "d"], index=range(0)).astype( - {"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.float64} + {"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.object_} ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 5118561f67338..330c937729b55 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1627,7 +1627,7 @@ def test_min_max_dt64_api_consistency_empty_df(self): # check DataFrame/Series api consistency when calling min/max on an empty # DataFrame/Series. df = DataFrame({"x": []}) - expected_float_series = Series([], dtype=float) + expected_float_series = Series([], dtype=object) # check axis 0 assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min()) assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max()) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 03db284d892e3..c45d46607982a 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1418,11 +1418,12 @@ def test_stack_timezone_aware_values(future_stack): def test_stack_empty_frame(dropna, future_stack): # GH 36113 levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)] - expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []])) + expected = Series(dtype=np.object_, index=MultiIndex(levels=levels, codes=[[], []])) if future_stack and dropna is not lib.no_default: with pytest.raises(ValueError, match="dropna must be unspecified"): DataFrame(dtype=np.float64).stack(dropna=dropna, future_stack=future_stack) else: + # dtype=np.float64 is lost since there are no columns result = DataFrame(dtype=np.float64).stack( dropna=dropna, future_stack=future_stack ) @@ -1612,7 +1613,9 @@ def test_unstack(self, multiindex_year_month_day_dataframe_random_data): ( [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]], ["ix1", "ix2", "col1", "col2", "col3"], - None, + # Nones are used as floats in the presence of numeric data, + # resulting in np.nan for index level 1. + np.nan, [None, None, 30.0], ), ], @@ -1624,10 +1627,12 @@ def test_unstack_partial( # https://github.com/pandas-dev/pandas/issues/19351 # make sure DataFrame.unstack() works when its run on a subset of the DataFrame # and the Index levels contain values that are not present in the subset - result = DataFrame(result_rows, columns=result_columns).set_index( - ["ix1", "ix2"] + data = ( + DataFrame(result_rows, columns=result_columns) + .set_index(["ix1", "ix2"]) + .iloc[1:2] ) - result = result.iloc[1:2].unstack("ix2") + result = data.unstack("ix2") expected = DataFrame( [expected_row], columns=MultiIndex.from_product( diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index af0deba138469..70816e7fd1da7 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -192,7 +192,7 @@ def test_quantile_missing_group_values_no_segfaults(): ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]), (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]), ([0], [42], [0], [42.0]), - ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")), + ([], np.array([], dtype="float64"), [], np.array([], dtype="float64")), ], ) def test_quantile_missing_group_values_correct_results( diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index e27c782c1bdcf..24ac7e8c4fa94 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1479,9 +1479,7 @@ def test_empty_df(method, op): group = getattr(gb, "b") result = getattr(group, method)(op) - expected = Series( - [], name="b", dtype="float64", index=Index([], dtype="float64", name="a") - ) + expected = Series([], name="b", index=Index([], name="a")) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b99ef2a0e840d..a58e10d8005d1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1116,10 +1116,10 @@ def convert_force_pure(x): def test_groupby_dtype_inference_empty(): # GH 6733 df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")}) - assert df["x"].dtype == np.float64 + assert df["x"].dtype == np.object_ result = df.groupby("x").first() - exp_index = Index([], name="x", dtype=np.float64) + exp_index = Index([], name="x", dtype=np.object_) expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")}) tm.assert_frame_equal(result, expected, by_blocks=True) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 39eadd32f300d..3a62bb48ba5b7 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -739,19 +739,15 @@ def test_list_grouper_with_nat(self): [ ( "transform", - Series(name=2, dtype=np.float64), + Series(name=2), ), ( "agg", - Series( - name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1) - ), + Series(name=2, index=Index([], name=1)), ), ( "apply", - Series( - name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1) - ), + Series(name=2, index=Index([], name=1)), ), ], ) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index b0a041ed5b69c..71cfa850cc855 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -119,7 +119,6 @@ def test_partial_set_empty_frame3(self): expected = DataFrame( columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") ) - expected["foo"] = expected["foo"].astype("float64") df = DataFrame(index=Index([], dtype="int64")) df["foo"] = [] @@ -128,6 +127,11 @@ def test_partial_set_empty_frame3(self): df = DataFrame(index=Index([], dtype="int64")) df["foo"] = Series(np.arange(len(df)), dtype="float64") + expected = DataFrame( + columns=Index(["foo"], dtype=object), + index=Index([], dtype="int64"), + dtype="float64", + ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 5ee9b65ba9ae7..f57da2a8c6d27 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -2009,7 +2009,7 @@ def test_resample_empty_series_with_tz(): expected_idx = DatetimeIndex( [], freq="2MS", name="ts", dtype="datetime64[ns, Atlantic/Faroe]" ) - expected = Series([], index=expected_idx, name="values", dtype="float64") + expected = Series([], index=expected_idx, name="values") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index f86cc0c69d363..a51a128a3e7f0 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -572,7 +572,7 @@ def test_concat_empty_and_non_empty_frame_regression(): # GH 18178 regression test df1 = DataFrame({"foo": [1]}) df2 = DataFrame({"foo": []}) - expected = DataFrame({"foo": [1.0]}) + expected = DataFrame({"foo": [1]}, dtype="object") result = concat([df1, df2]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 06d57c48df817..ea02ceb2fcb30 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -90,7 +90,7 @@ def test_concat_empty_series_timelike(self, tz, values): expected = DataFrame( { 0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz), - 1: values, + 1: Series(values, dtype=dtype), } ) result = concat([first, second], axis=1) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index f224a45ca3279..21107e2d8fb58 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -924,7 +924,14 @@ def test_invalid_separator(self): "A": [], "B": [], } - expected = DataFrame(exp_data).astype({"year": np.int64}) + expected = DataFrame(exp_data).astype( + { + "A2010": np.float64, + "A2011": np.float64, + "B2010": np.float64, + "year": np.int64, + } + ) expected = expected.set_index(["id", "year"])[ ["X", "A2010", "A2011", "B2010", "A", "B"] ] @@ -987,7 +994,14 @@ def test_invalid_suffixtype(self): "A": [], "B": [], } - expected = DataFrame(exp_data).astype({"year": np.int64}) + expected = DataFrame(exp_data).astype( + { + "Aone": np.float64, + "Atwo": np.float64, + "Bone": np.float64, + "year": np.int64, + } + ) expected = expected.set_index(["id", "year"]) expected.index = expected.index.set_levels([0, 1], level=0) @@ -1211,7 +1225,7 @@ def test_missing_stubname(self, dtype): name=("id", "num"), ) expected = DataFrame( - {"a": [100, 200, 300, 400], "b": [np.nan] * 4}, + {"a": [100, 200, 300, 400], "b": pd.Series([np.nan] * 4, dtype="object")}, index=index, ) new_level = expected.index.levels[0].astype(dtype) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 3f9d5bbe806bb..03c7cf4ec3ee8 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1416,7 +1416,7 @@ def test_constructor_dict_tuple_indexer(self): data = {(1, 1, None): -1.0} result = Series(data) expected = Series( - -1.0, index=MultiIndex(levels=[[1], [1], [np.nan]], codes=[[0], [0], [-1]]) + -1.0, index=MultiIndex(levels=[[1], [1], []], codes=[[0], [0], [-1]]) ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 120470b09a92b..983d4f524f209 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -549,7 +549,7 @@ def test_groupby_rolling_empty_frame(self): # GH-38057 from_tuples gives empty object dtype, we now get float/int levels # expected.index = MultiIndex.from_tuples([], names=["s1", None]) expected.index = MultiIndex.from_product( - [Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None] + [Index([]), Index([], dtype="int64")], names=["s1", None] ) tm.assert_frame_equal(result, expected) @@ -559,8 +559,8 @@ def test_groupby_rolling_empty_frame(self): expected = expected.drop(columns=["s1", "s2"]) expected.index = MultiIndex.from_product( [ - Index([], dtype="float64"), - Index([], dtype="float64"), + Index([]), + Index([]), Index([], dtype="int64"), ], names=["s1", "s2", None], diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 820b0134cc577..82fb5fda9ff06 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -671,7 +671,7 @@ def test_rolling_on_empty(self): # GH-32385 df = DataFrame({"column": []}, index=[]) result = df.rolling("5s").min() - expected = DataFrame({"column": []}, index=[]) + expected = DataFrame({"column": []}, index=[], dtype="float64") tm.assert_frame_equal(result, expected) def test_rolling_on_multi_index_level(self): From 9d249032c8b60676c3c206786a4b99b1b124af01 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 12 May 2024 09:27:31 -0400 Subject: [PATCH 2/4] cleanup --- pandas/tests/groupby/methods/test_quantile.py | 7 ++++++- pandas/tests/groupby/test_grouping.py | 15 +++------------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 70816e7fd1da7..a7e5cccab6b55 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -192,7 +192,12 @@ def test_quantile_missing_group_values_no_segfaults(): ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]), (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]), ([0], [42], [0], [42.0]), - ([], np.array([], dtype="float64"), [], np.array([], dtype="float64")), + ( + np.array([], dtype="float64"), + np.array([], dtype="float64"), + np.array([], dtype="float64"), + np.array([], dtype="float64"), + ), ], ) def test_quantile_missing_group_values_correct_results( diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 3a62bb48ba5b7..a6416aea98591 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -737,18 +737,9 @@ def test_list_grouper_with_nat(self): @pytest.mark.parametrize( "func,expected", [ - ( - "transform", - Series(name=2), - ), - ( - "agg", - Series(name=2, index=Index([], name=1)), - ), - ( - "apply", - Series(name=2, index=Index([], name=1)), - ), + ("transform", Series(name=2)), + ("agg", Series(name=2, index=Index([], name=1))), + ("apply", Series(name=2, index=Index([], name=1))), ], ) def test_evaluate_with_empty_groups(self, func, expected): From cf562f0d808d8e733f72624cb216cae390855292 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 15 May 2024 13:16:37 -0400 Subject: [PATCH 3/4] WIP --- pandas/core/frame.py | 24 +++++++++++++-------- pandas/tests/frame/methods/test_quantile.py | 9 ++++---- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c51c7f8a5e0fe..31f5f1c356fd8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -13045,6 +13045,7 @@ def quantile( C 1 days 12:00:00 Name: 0.5, dtype: object """ + from pandas.core.dtypes.common import is_object_dtype validate_percentile(q) axis = self._get_axis_number(axis) @@ -13064,7 +13065,7 @@ def quantile( # GH#41544 try to get an appropriate dtype dtype = "float64" cdtype = find_common_type(list(self.dtypes)) - if needs_i8_conversion(cdtype): + if needs_i8_conversion(cdtype) or is_object_dtype(cdtype): dtype = cdtype return res.astype(dtype) return res @@ -13083,7 +13084,7 @@ def quantile( if axis == 1: # GH#41544 try to get an appropriate dtype cdtype = find_common_type(list(self.dtypes)) - if needs_i8_conversion(cdtype): + if needs_i8_conversion(cdtype) or is_object_dtype(cdtype): dtype = cdtype res = self._constructor([], index=q, columns=cols, dtype=dtype) @@ -13094,6 +13095,18 @@ def quantile( raise ValueError( f"Invalid method: {method}. Method must be in {valid_method}." ) + + # handle degenerate case + if len(data) == 0: + dtype = np.float64 + if data.ndim == 2: + cdtype = find_common_type(list(self.dtypes)) + else: + cdtype = self.dtype + if needs_i8_conversion(cdtype) or is_object_dtype(cdtype): + dtype = cdtype + return self._constructor([], index=q, columns=data.columns, dtype=dtype) + if method == "single": res = data._mgr.quantile(qs=q, interpolation=interpolation) elif method == "table": @@ -13103,13 +13116,6 @@ def quantile( f"Invalid interpolation: {interpolation}. " f"Interpolation must be in {valid_interpolation}" ) - # handle degenerate case - if len(data) == 0: - if data.ndim == 2: - dtype = find_common_type(list(self.dtypes)) - else: - dtype = self.dtype - return self._constructor([], index=q, columns=data.columns, dtype=dtype) q_idx = np.quantile(np.arange(len(data)), q, method=interpolation) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 842d2c3a416d5..ff8758a9f9759 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -319,7 +319,8 @@ def test_quantile_multi_empty(self, interp_method): result = DataFrame({"x": [], "y": []}).quantile( [0.1, 0.9], axis=0, interpolation=interpolation, method=method ) - dtype = "float64" if method == "single" else "object" + # dtype = "float64" if method == "single" else "object" + dtype = "object" expected = DataFrame( {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9], @@ -692,7 +693,7 @@ def test_quantile_empty_no_rows_dt64(self, interp_method): 0.5, numeric_only=False, interpolation=interpolation, method=method ) exp = exp.astype(object) - if interpolation == "nearest": + if True or interpolation == "nearest": # GH#18463 TODO: would we prefer NaTs here? exp = exp.fillna(np.nan) tm.assert_series_equal(res, exp) @@ -911,7 +912,7 @@ def test_empty_datelike( @pytest.mark.parametrize( "expected_data, expected_index, axis", [ - [[np.nan, np.nan], range(2), 1], + [[pd.NaT, pd.NaT], range(2), 1], [[], [], 0], ], ) @@ -926,6 +927,6 @@ def test_datelike_numeric_only(self, expected_data, expected_index, axis): ) result = df[["a", "c"]].quantile(0.5, axis=axis, numeric_only=True) expected = Series( - expected_data, name=0.5, index=Index(expected_index), dtype=np.float64 + expected_data, name=0.5, index=Index(expected_index) ) tm.assert_series_equal(result, expected) From 088273bedca291f422cc49de57f6251e641bdbc0 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 2 Jun 2024 11:33:35 -0400 Subject: [PATCH 4/4] Rework quantile --- pandas/core/frame.py | 25 ++++++++++++++------- pandas/tests/frame/methods/test_quantile.py | 14 +++++------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 31f5f1c356fd8..1b3b04aae2a5e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -83,6 +83,7 @@ can_hold_element, construct_1d_arraylike_from_scalar, construct_2d_arraylike_from_scalar, + ensure_dtype_can_hold_na, find_common_type, infer_dtype_from_scalar, invalidate_string_dtypes, @@ -13046,6 +13047,7 @@ def quantile( Name: 0.5, dtype: object """ from pandas.core.dtypes.common import is_object_dtype + validate_percentile(q) axis = self._get_axis_number(axis) @@ -13075,6 +13077,10 @@ def quantile( if axis == 1: data = data.T + if data.shape[0] == 0: + # The transpose has no rows, so the original has no columns, meaning we + # have no dtype information. Since this is quantile, default to float64 + data = data.astype("float64") if len(data.columns) == 0: # GH#23925 _get_numeric_data may have dropped all columns @@ -13098,14 +13104,17 @@ def quantile( # handle degenerate case if len(data) == 0: - dtype = np.float64 - if data.ndim == 2: - cdtype = find_common_type(list(self.dtypes)) - else: - cdtype = self.dtype - if needs_i8_conversion(cdtype) or is_object_dtype(cdtype): - dtype = cdtype - return self._constructor([], index=q, columns=data.columns, dtype=dtype) + from pandas import array + + result = self._constructor( + { + idx: array(len(q) * [np.nan], dtype=ensure_dtype_can_hold_na(dtype)) + for idx, dtype in enumerate(data.dtypes) + }, + index=q, + ) + result.columns = data.columns + return result if method == "single": res = data._mgr.quantile(qs=q, interpolation=interpolation) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index ff8758a9f9759..6347a770f06f0 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -319,12 +319,10 @@ def test_quantile_multi_empty(self, interp_method): result = DataFrame({"x": [], "y": []}).quantile( [0.1, 0.9], axis=0, interpolation=interpolation, method=method ) - # dtype = "float64" if method == "single" else "object" - dtype = "object" expected = DataFrame( {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9], - dtype=dtype, + dtype="object", ) tm.assert_frame_equal(result, expected) @@ -692,10 +690,8 @@ def test_quantile_empty_no_rows_dt64(self, interp_method): res = df.quantile( 0.5, numeric_only=False, interpolation=interpolation, method=method ) - exp = exp.astype(object) - if True or interpolation == "nearest": - # GH#18463 TODO: would we prefer NaTs here? - exp = exp.fillna(np.nan) + # GH#18463 TODO: would we prefer NaTs here? + exp = exp.astype(object).fillna(pd.NaT) tm.assert_series_equal(res, exp) # both dt64tz @@ -912,7 +908,7 @@ def test_empty_datelike( @pytest.mark.parametrize( "expected_data, expected_index, axis", [ - [[pd.NaT, pd.NaT], range(2), 1], + [[np.nan, np.nan], range(2), 1], [[], [], 0], ], ) @@ -927,6 +923,6 @@ def test_datelike_numeric_only(self, expected_data, expected_index, axis): ) result = df[["a", "c"]].quantile(0.5, axis=axis, numeric_only=True) expected = Series( - expected_data, name=0.5, index=Index(expected_index) + expected_data, name=0.5, index=Index(expected_index), dtype=np.float64 ) tm.assert_series_equal(result, expected)