Skip to content

Commit be2c977

Browse files
committed
BUG: DataFrame constructor defaulting to float dtype on empty input
1 parent 34177d6 commit be2c977

21 files changed

+66
-48
lines changed

pandas/core/construction.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -652,8 +652,8 @@ def sanitize_array(
652652
data = list(data)
653653

654654
if len(data) == 0 and dtype is None:
655-
# We default to float64, matching numpy
656-
subarr = np.array([], dtype=np.float64)
655+
# We default to object dtype for empty input, diverging from NumPy
# (which would give float64)
656+
subarr = np.array([], dtype=np.object_)
657657

658658
elif dtype is not None:
659659
subarr = _try_cast(data, dtype, copy)

pandas/core/frame.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13059,16 +13059,14 @@ def quantile(
1305913059
interpolation=interpolation,
1306013060
method=method,
1306113061
)
13062-
if method == "single":
13063-
res = res_df.iloc[0]
13064-
else:
13065-
# cannot directly iloc over sparse arrays
13066-
res = res_df.T.iloc[:, 0]
13062+
res = res_df.iloc[0]
1306713063
if axis == 1 and len(self) == 0:
1306813064
# GH#41544 try to get an appropriate dtype
13069-
dtype = find_common_type(list(self.dtypes))
13070-
if needs_i8_conversion(dtype):
13071-
return res.astype(dtype)
13065+
dtype = "float64"
13066+
cdtype = find_common_type(list(self.dtypes))
13067+
if needs_i8_conversion(cdtype):
13068+
dtype = cdtype
13069+
return res.astype(dtype)
1307213070
return res
1307313071

1307413072
q = Index(q, dtype=np.float64)

pandas/core/groupby/generic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,7 @@ def _transform_general(
578578
concatenated = concat(results, ignore_index=True)
579579
result = self._set_result_index_ordered(concatenated)
580580
else:
581-
result = self.obj._constructor(dtype=np.float64)
581+
result = self.obj._constructor(dtype=self.obj.dtype)
582582

583583
result.name = self.obj.name
584584
return result

pandas/core/internals/managers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1778,7 +1778,7 @@ def as_array(
17781778
passed_nan = lib.is_float(na_value) and isna(na_value)
17791779

17801780
if len(self.blocks) == 0:
1781-
arr = np.empty(self.shape, dtype=float)
1781+
arr = np.empty(self.shape, dtype=object)
17821782
return arr.transpose()
17831783

17841784
if self.is_single_block:

pandas/tests/arrays/categorical/test_missing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,9 @@ def test_compare_categorical_with_missing(self, a1, a2, categories):
122122
"na_value, dtype",
123123
[
124124
(pd.NaT, "datetime64[ns]"),
125-
(None, "float64"),
125+
(None, "object"),
126126
(np.nan, "float64"),
127-
(pd.NA, "float64"),
127+
(pd.NA, "object"),
128128
],
129129
)
130130
def test_categorical_only_missing_values_no_cast(self, na_value, dtype):

pandas/tests/frame/methods/test_quantile.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def test_quantile(self, datetime_frame, interp_method, request):
8181
def test_empty(self, interp_method):
8282
interpolation, method = interp_method
8383
q = DataFrame({"x": [], "y": []}).quantile(
84-
0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method
84+
0.1, axis=0, interpolation=interpolation, method=method
8585
)
8686
assert np.isnan(q["x"]) and np.isnan(q["y"])
8787

@@ -319,8 +319,11 @@ def test_quantile_multi_empty(self, interp_method):
319319
result = DataFrame({"x": [], "y": []}).quantile(
320320
[0.1, 0.9], axis=0, interpolation=interpolation, method=method
321321
)
322+
dtype = "float64" if method == "single" else "object"
322323
expected = DataFrame(
323-
{"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
324+
{"x": [np.nan, np.nan], "y": [np.nan, np.nan]},
325+
index=[0.1, 0.9],
326+
dtype=dtype,
324327
)
325328
tm.assert_frame_equal(result, expected)
326329

pandas/tests/frame/methods/test_reindex.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def test_setitem_reset_index_dtypes(self):
7777
df1["d"] = []
7878
result = df1.reset_index()
7979
expected = DataFrame(columns=["a", "b", "c", "d"], index=range(0)).astype(
80-
{"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.float64}
80+
{"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.object_}
8181
)
8282
tm.assert_frame_equal(result, expected)
8383

pandas/tests/frame/test_reductions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1627,7 +1627,7 @@ def test_min_max_dt64_api_consistency_empty_df(self):
16271627
# check DataFrame/Series api consistency when calling min/max on an empty
16281628
# DataFrame/Series.
16291629
df = DataFrame({"x": []})
1630-
expected_float_series = Series([], dtype=float)
1630+
expected_float_series = Series([], dtype=object)
16311631
# check axis 0
16321632
assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min())
16331633
assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max())

pandas/tests/frame/test_stack_unstack.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1418,11 +1418,12 @@ def test_stack_timezone_aware_values(future_stack):
14181418
def test_stack_empty_frame(dropna, future_stack):
14191419
# GH 36113
14201420
levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)]
1421-
expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []]))
1421+
expected = Series(dtype=np.object_, index=MultiIndex(levels=levels, codes=[[], []]))
14221422
if future_stack and dropna is not lib.no_default:
14231423
with pytest.raises(ValueError, match="dropna must be unspecified"):
14241424
DataFrame(dtype=np.float64).stack(dropna=dropna, future_stack=future_stack)
14251425
else:
1426+
# dtype=np.float64 is lost since there are no columns
14261427
result = DataFrame(dtype=np.float64).stack(
14271428
dropna=dropna, future_stack=future_stack
14281429
)
@@ -1612,7 +1613,9 @@ def test_unstack(self, multiindex_year_month_day_dataframe_random_data):
16121613
(
16131614
[[1, 1, None, None, 30.0], [2, None, None, None, 30.0]],
16141615
["ix1", "ix2", "col1", "col2", "col3"],
1615-
None,
1616+
# None is treated as a float missing value (np.nan) when the column also
1617+
# holds numeric data, so index level 1 contains np.nan rather than None.
1618+
np.nan,
16161619
[None, None, 30.0],
16171620
),
16181621
],
@@ -1624,10 +1627,12 @@ def test_unstack_partial(
16241627
# https://github.com/pandas-dev/pandas/issues/19351
16251628
# make sure DataFrame.unstack() works when its run on a subset of the DataFrame
16261629
# and the Index levels contain values that are not present in the subset
1627-
result = DataFrame(result_rows, columns=result_columns).set_index(
1628-
["ix1", "ix2"]
1630+
data = (
1631+
DataFrame(result_rows, columns=result_columns)
1632+
.set_index(["ix1", "ix2"])
1633+
.iloc[1:2]
16291634
)
1630-
result = result.iloc[1:2].unstack("ix2")
1635+
result = data.unstack("ix2")
16311636
expected = DataFrame(
16321637
[expected_row],
16331638
columns=MultiIndex.from_product(

pandas/tests/groupby/methods/test_quantile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def test_quantile_missing_group_values_no_segfaults():
192192
([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
193193
(["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
194194
([0], [42], [0], [42.0]),
195-
([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
195+
([], np.array([], dtype="float64"), [], np.array([], dtype="float64")),
196196
],
197197
)
198198
def test_quantile_missing_group_values_correct_results(

pandas/tests/groupby/test_apply.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1479,9 +1479,7 @@ def test_empty_df(method, op):
14791479
group = getattr(gb, "b")
14801480

14811481
result = getattr(group, method)(op)
1482-
expected = Series(
1483-
[], name="b", dtype="float64", index=Index([], dtype="float64", name="a")
1484-
)
1482+
expected = Series([], name="b", index=Index([], name="a"))
14851483

14861484
tm.assert_series_equal(result, expected)
14871485

pandas/tests/groupby/test_groupby.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1116,10 +1116,10 @@ def convert_force_pure(x):
11161116
def test_groupby_dtype_inference_empty():
11171117
# GH 6733
11181118
df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
1119-
assert df["x"].dtype == np.float64
1119+
assert df["x"].dtype == np.object_
11201120

11211121
result = df.groupby("x").first()
1122-
exp_index = Index([], name="x", dtype=np.float64)
1122+
exp_index = Index([], name="x", dtype=np.object_)
11231123
expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")})
11241124
tm.assert_frame_equal(result, expected, by_blocks=True)
11251125

pandas/tests/groupby/test_grouping.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -739,19 +739,15 @@ def test_list_grouper_with_nat(self):
739739
[
740740
(
741741
"transform",
742-
Series(name=2, dtype=np.float64),
742+
Series(name=2),
743743
),
744744
(
745745
"agg",
746-
Series(
747-
name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
748-
),
746+
Series(name=2, index=Index([], name=1)),
749747
),
750748
(
751749
"apply",
752-
Series(
753-
name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
754-
),
750+
Series(name=2, index=Index([], name=1)),
755751
),
756752
],
757753
)

pandas/tests/indexing/test_partial.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,6 @@ def test_partial_set_empty_frame3(self):
119119
expected = DataFrame(
120120
columns=Index(["foo"], dtype=object), index=Index([], dtype="int64")
121121
)
122-
expected["foo"] = expected["foo"].astype("float64")
123122

124123
df = DataFrame(index=Index([], dtype="int64"))
125124
df["foo"] = []
@@ -128,6 +127,11 @@ def test_partial_set_empty_frame3(self):
128127

129128
df = DataFrame(index=Index([], dtype="int64"))
130129
df["foo"] = Series(np.arange(len(df)), dtype="float64")
130+
expected = DataFrame(
131+
columns=Index(["foo"], dtype=object),
132+
index=Index([], dtype="int64"),
133+
dtype="float64",
134+
)
131135

132136
tm.assert_frame_equal(df, expected)
133137

pandas/tests/resample/test_datetime_index.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2009,7 +2009,7 @@ def test_resample_empty_series_with_tz():
20092009
expected_idx = DatetimeIndex(
20102010
[], freq="2MS", name="ts", dtype="datetime64[ns, Atlantic/Faroe]"
20112011
)
2012-
expected = Series([], index=expected_idx, name="values", dtype="float64")
2012+
expected = Series([], index=expected_idx, name="values")
20132013
tm.assert_series_equal(result, expected)
20142014

20152015

pandas/tests/reshape/concat/test_concat.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,7 @@ def test_concat_empty_and_non_empty_frame_regression():
572572
# GH 18178 regression test
573573
df1 = DataFrame({"foo": [1]})
574574
df2 = DataFrame({"foo": []})
575-
expected = DataFrame({"foo": [1.0]})
575+
expected = DataFrame({"foo": [1]}, dtype="object")
576576
result = concat([df1, df2])
577577
tm.assert_frame_equal(result, expected)
578578

pandas/tests/reshape/concat/test_empty.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def test_concat_empty_series_timelike(self, tz, values):
9090
expected = DataFrame(
9191
{
9292
0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz),
93-
1: values,
93+
1: Series(values, dtype=dtype),
9494
}
9595
)
9696
result = concat([first, second], axis=1)

pandas/tests/reshape/test_melt.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -924,7 +924,14 @@ def test_invalid_separator(self):
924924
"A": [],
925925
"B": [],
926926
}
927-
expected = DataFrame(exp_data).astype({"year": np.int64})
927+
expected = DataFrame(exp_data).astype(
928+
{
929+
"A2010": np.float64,
930+
"A2011": np.float64,
931+
"B2010": np.float64,
932+
"year": np.int64,
933+
}
934+
)
928935
expected = expected.set_index(["id", "year"])[
929936
["X", "A2010", "A2011", "B2010", "A", "B"]
930937
]
@@ -987,7 +994,14 @@ def test_invalid_suffixtype(self):
987994
"A": [],
988995
"B": [],
989996
}
990-
expected = DataFrame(exp_data).astype({"year": np.int64})
997+
expected = DataFrame(exp_data).astype(
998+
{
999+
"Aone": np.float64,
1000+
"Atwo": np.float64,
1001+
"Bone": np.float64,
1002+
"year": np.int64,
1003+
}
1004+
)
9911005

9921006
expected = expected.set_index(["id", "year"])
9931007
expected.index = expected.index.set_levels([0, 1], level=0)
@@ -1211,7 +1225,7 @@ def test_missing_stubname(self, dtype):
12111225
name=("id", "num"),
12121226
)
12131227
expected = DataFrame(
1214-
{"a": [100, 200, 300, 400], "b": [np.nan] * 4},
1228+
{"a": [100, 200, 300, 400], "b": pd.Series([np.nan] * 4, dtype="object")},
12151229
index=index,
12161230
)
12171231
new_level = expected.index.levels[0].astype(dtype)

pandas/tests/series/test_constructors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1416,7 +1416,7 @@ def test_constructor_dict_tuple_indexer(self):
14161416
data = {(1, 1, None): -1.0}
14171417
result = Series(data)
14181418
expected = Series(
1419-
-1.0, index=MultiIndex(levels=[[1], [1], [np.nan]], codes=[[0], [0], [-1]])
1419+
-1.0, index=MultiIndex(levels=[[1], [1], []], codes=[[0], [0], [-1]])
14201420
)
14211421
tm.assert_series_equal(result, expected)
14221422

pandas/tests/window/test_groupby.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -549,7 +549,7 @@ def test_groupby_rolling_empty_frame(self):
549549
# GH-38057 from_tuples gives empty object dtype, we now get object/int levels
550550
# expected.index = MultiIndex.from_tuples([], names=["s1", None])
551551
expected.index = MultiIndex.from_product(
552-
[Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None]
552+
[Index([]), Index([], dtype="int64")], names=["s1", None]
553553
)
554554
tm.assert_frame_equal(result, expected)
555555

@@ -559,8 +559,8 @@ def test_groupby_rolling_empty_frame(self):
559559
expected = expected.drop(columns=["s1", "s2"])
560560
expected.index = MultiIndex.from_product(
561561
[
562-
Index([], dtype="float64"),
563-
Index([], dtype="float64"),
562+
Index([]),
563+
Index([]),
564564
Index([], dtype="int64"),
565565
],
566566
names=["s1", "s2", None],

pandas/tests/window/test_timeseries_window.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -671,7 +671,7 @@ def test_rolling_on_empty(self):
671671
# GH-32385
672672
df = DataFrame({"column": []}, index=[])
673673
result = df.rolling("5s").min()
674-
expected = DataFrame({"column": []}, index=[])
674+
expected = DataFrame({"column": []}, index=[], dtype="float64")
675675
tm.assert_frame_equal(result, expected)
676676

677677
def test_rolling_on_multi_index_level(self):

0 commit comments

Comments
 (0)