Skip to content

Commit 98e5400

Browse files
committed
POC: infer time objects to ArrowDtype[time]
1 parent 0e8c730 commit 98e5400

File tree

16 files changed

+264
-28
lines changed

16 files changed

+264
-28
lines changed

pandas/_libs/lib.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ def map_infer(
7171
convert: bool = ...,
7272
ignore_na: bool = ...,
7373
) -> np.ndarray: ...
74+
7475
@overload
7576
def maybe_convert_objects(
7677
objects: npt.NDArray[np.object_],

pandas/_libs/lib.pyx

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ from typing import (
66
Literal,
77
_GenericAlias,
88
)
9+
import warnings
910

1011
cimport cython
1112
from cpython.datetime cimport (
@@ -99,6 +100,8 @@ cdef extern from "pandas/parser/pd_parser.h":
99100

100101
PandasParser_IMPORT
101102

103+
from pandas._config import get_option
104+
102105
from pandas._libs cimport util
103106
from pandas._libs.util cimport (
104107
INT64_MAX,
@@ -1299,6 +1302,7 @@ cdef class Seen:
12991302
bint datetimetz_ # seen_datetimetz
13001303
bint period_ # seen_period
13011304
bint interval_ # seen_interval
1305+
bint time_
13021306

13031307
def __cinit__(self, bint coerce_numeric=False):
13041308
"""
@@ -1325,6 +1329,7 @@ cdef class Seen:
13251329
self.datetimetz_ = False
13261330
self.period_ = False
13271331
self.interval_ = False
1332+
self.time_ = False
13281333
self.coerce_numeric = coerce_numeric
13291334

13301335
cdef bint check_uint64_conflict(self) except -1:
@@ -2615,6 +2620,12 @@ def maybe_convert_objects(ndarray[object] objects,
26152620
else:
26162621
seen.object_ = True
26172622
break
2623+
elif PyTime_Check(val):
2624+
if convert_time:
2625+
seen.time_ = True
2626+
else:
2627+
seen.object_ = True
2628+
break
26182629
else:
26192630
seen.object_ = True
26202631
break
@@ -2679,7 +2690,36 @@ def maybe_convert_objects(ndarray[object] objects,
26792690

26802691
seen.object_ = True
26812692

2682-
elif seen.nat_:
2693+
elif seen.time_:
2694+
if is_time_array(objects):
2695+
opt = get_option("future.infer_time")
2696+
if opt is True:
2697+
import pyarrow as pa
2698+
2699+
from pandas.core.arrays.arrow import ArrowDtype
2700+
2701+
obj = pa.array(objects)
2702+
dtype = ArrowDtype(obj.type)
2703+
return dtype.construct_array_type()(obj)
2704+
elif opt is False:
2705+
# explicitly set to keep the old behavior and avoid the warning
2706+
pass
2707+
else:
2708+
from pandas.util._exceptions import find_stack_level
2709+
warnings.warn(
2710+
"Pandas type inference with a sequence of `datetime.time` "
2711+
"objects is deprecated. In a future version, this will give "
2712+
"time64[pyarrow] dtype, which will require pyarrow to be "
2713+
"installed. To opt in to the new behavior immediately set "
2714+
"`pd.set_option('future.infer_time', True)`. To keep the "
2715+
"old behavior pass `dtype=object`.",
2716+
FutureWarning,
2717+
stacklevel=find_stack_level(),
2718+
)
2719+
2720+
seen.object_ = True
2721+
2722+
if seen.nat_:
26832723
if not seen.object_ and not seen.numeric_ and not seen.bool_:
26842724
# all NaT, None, or nan (at least one NaT)
26852725
# see GH#49340 for discussion of desired behavior

pandas/core/config_init.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -889,3 +889,14 @@ def register_converter_cb(key) -> None:
889889
styler_environment,
890890
validator=is_instance_factory([type(None), str]),
891891
)
892+
893+
894+
with cf.config_prefix("future"):
895+
cf.register_option(
896+
"infer_time",
897+
None,
898+
"Whether to infer sequence of datetime.time objects as pyarrow time "
899+
"dtype, which will be the default in pandas 3.0 "
900+
"(at which point this option will be deprecated).",
901+
validator=is_one_of_factory([True, False, None]),
902+
)

pandas/core/construction.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
import numpy as np
2020
from numpy import ma
2121

22+
from pandas._config import get_option
23+
2224
from pandas._libs import lib
2325
from pandas._libs.tslibs import (
2426
Period,
@@ -295,6 +297,7 @@ def array(
295297
PeriodArray,
296298
TimedeltaArray,
297299
)
300+
from pandas.core.arrays.arrow import ArrowDtype
298301
from pandas.core.arrays.string_ import StringDtype
299302

300303
if lib.is_scalar(data):
@@ -362,6 +365,30 @@ def array(
362365
elif inferred_dtype == "boolean":
363366
return BooleanArray._from_sequence(data, copy=copy)
364367

368+
elif inferred_dtype == "time":
369+
opt = get_option("future.infer_time")
370+
371+
if opt is True:
372+
import pyarrow as pa
373+
374+
obj = pa.array(data)
375+
dtype = ArrowDtype(obj.type)
376+
return dtype.construct_array_type()(obj)
377+
elif opt is False:
378+
# explicitly set to keep the old behavior and avoid the warning
379+
pass
380+
else:
381+
warnings.warn(
382+
"Pandas type inference with a sequence of `datetime.time` "
383+
"objects is deprecated. In a future version, this will give "
384+
"time64[pyarrow] dtype, which will require pyarrow to be "
385+
"installed. To opt in to the new behavior immediately set "
386+
"`pd.set_option('future.infer_time', True)`. To keep the "
387+
"old behavior pass `dtype=object`.",
388+
FutureWarning,
389+
stacklevel=find_stack_level(),
390+
)
391+
365392
# Pandas overrides NumPy for
366393
# 1. datetime64[ns,us,ms,s]
367394
# 2. timedelta64[ns,us,ms,s]

pandas/core/indexes/accessors.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,9 @@ def _delegate_property_get(self, name: str): # type: ignore[override]
108108
else:
109109
index = self._parent.index
110110
# return the result as a Series
111-
result = Series(result, index=index, name=self.name).__finalize__(self._parent)
111+
result = Series(
112+
result, index=index, name=self.name, dtype=result.dtype
113+
).__finalize__(self._parent)
112114

113115
# setting this object will show a SettingWithCopyWarning/Error
114116
result._is_copy = (

pandas/tests/arithmetic/test_datetime64.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1166,7 +1166,10 @@ def test_dt64arr_add_sub_parr(
11661166
)
11671167
assert_invalid_addsub_type(dtarr, parr, msg)
11681168

1169-
def test_dt64arr_addsub_time_objects_raises(self, box_with_array, tz_naive_fixture):
1169+
@pytest.mark.parametrize("future", [True, False, None])
1170+
def test_dt64arr_addsub_time_objects_raises(
1171+
self, box_with_array, tz_naive_fixture, future
1172+
):
11701173
# https://github.com/pandas-dev/pandas/issues/10329
11711174

11721175
tz = tz_naive_fixture
@@ -1175,15 +1178,23 @@ def test_dt64arr_addsub_time_objects_raises(self, box_with_array, tz_naive_fixtu
11751178
obj2 = [time(i, i, i) for i in range(3)]
11761179

11771180
obj1 = tm.box_expected(obj1, box_with_array)
1178-
obj2 = tm.box_expected(obj2, box_with_array)
1179-
1180-
msg = "|".join(
1181-
[
1182-
"unsupported operand",
1183-
"cannot subtract DatetimeArray from ndarray",
1184-
]
1185-
)
11861181

1182+
msgs = [
1183+
"unsupported operand",
1184+
"cannot subtract DatetimeArray from ndarray",
1185+
]
1186+
warn_msg = "Pandas type inference with a sequence of `datetime.time` objects"
1187+
warn = None
1188+
if future is True:
1189+
msgs.append("cannot subtract DatetimeArray from ArrowExtensionArray")
1190+
elif future is None:
1191+
warn = FutureWarning
1192+
1193+
with pd.option_context("future.infer_time", future):
1194+
with tm.assert_produces_warning(warn, match=warn_msg):
1195+
obj2 = tm.box_expected(obj2, box_with_array)
1196+
1197+
msg = "|".join(msgs)
11871198
with warnings.catch_warnings(record=True):
11881199
# pandas.errors.PerformanceWarning: Non-vectorized DateOffset being
11891200
# applied to Series or DatetimeIndex

pandas/tests/dtypes/test_inference.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1038,6 +1038,56 @@ def test_maybe_convert_objects_ea(self, idx):
10381038
)
10391039
tm.assert_extension_array_equal(result, idx._data)
10401040

1041+
@pytest.mark.parametrize("future", [True, False, None])
1042+
def test_maybe_convert_objects_time(self, future):
1043+
ts = Timestamp.now()
1044+
objs = np.array([ts.time()], dtype=object)
1045+
1046+
msg = "Pandas type inference with a sequence of `datetime.time` objects"
1047+
warn = None
1048+
if future is True:
1049+
pa = pytest.importorskip("pyarrow")
1050+
dtype = pd.ArrowDtype(pa.time64("us"))
1051+
exp = dtype.construct_array_type()._from_sequence(objs, dtype=dtype)
1052+
else:
1053+
if future is None:
1054+
warn = FutureWarning
1055+
exp = objs
1056+
1057+
with pd.option_context("future.infer_time", future):
1058+
with tm.assert_produces_warning(warn, match=msg):
1059+
out = lib.maybe_convert_objects(objs, convert_time=True)
1060+
with tm.assert_produces_warning(warn, match=msg):
1061+
ser = Series(objs)
1062+
with tm.assert_produces_warning(warn, match=msg):
1063+
ser2 = Series(list(objs))
1064+
with tm.assert_produces_warning(warn, match=msg):
1065+
df = DataFrame(objs)
1066+
with tm.assert_produces_warning(warn, match=msg):
1067+
df2 = DataFrame(list(objs))
1068+
with tm.assert_produces_warning(warn, match=msg):
1069+
idx = Index(objs)
1070+
with tm.assert_produces_warning(warn, match=msg):
1071+
idx2 = Index(list(objs))
1072+
with tm.assert_produces_warning(warn, match=msg):
1073+
arr = pd.array(objs)
1074+
with tm.assert_produces_warning(warn, match=msg):
1075+
arr2 = pd.array(list(objs))
1076+
1077+
tm.assert_equal(out, exp)
1078+
if future:
1079+
tm.assert_equal(arr, exp)
1080+
tm.assert_equal(arr2, exp)
1081+
else:
1082+
tm.assert_equal(arr, pd.core.arrays.PandasArray(exp))
1083+
tm.assert_equal(arr2, pd.core.arrays.PandasArray(exp))
1084+
tm.assert_series_equal(ser, Series(exp, dtype=exp.dtype))
1085+
tm.assert_series_equal(ser2, Series(exp, dtype=exp.dtype))
1086+
tm.assert_frame_equal(df, DataFrame(exp, dtype=exp.dtype))
1087+
tm.assert_frame_equal(df2, DataFrame(exp, dtype=exp.dtype))
1088+
tm.assert_index_equal(idx, Index(exp, dtype=exp.dtype))
1089+
tm.assert_index_equal(idx2, Index(exp, dtype=exp.dtype))
1090+
10411091

10421092
class TestTypeInference:
10431093
# Dummy class used for testing with Python objects

pandas/tests/extension/test_arrow.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,28 @@ class TestBaseReshaping(base.BaseReshapingTests):
744744
def test_transpose(self, data):
745745
super().test_transpose(data)
746746

747+
@pytest.mark.parametrize(
748+
"columns",
749+
[
750+
["A", "B"],
751+
pd.MultiIndex.from_tuples(
752+
[("A", "a"), ("A", "b")], names=["outer", "inner"]
753+
),
754+
],
755+
)
756+
def test_stack(self, data, columns):
757+
warn = None
758+
warn_msg = "Pandas type inference with a sequence of `datetime.time` objects"
759+
760+
pa_dtype = data.dtype.pyarrow_dtype
761+
if pa.types.is_time(pa_dtype):
762+
# FIXME: need to avoid doing inference when calling frame._constructor
763+
# in _stack_multi_columns
764+
warn = FutureWarning
765+
766+
with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False):
767+
super().test_stack(data, columns)
768+
747769

748770
class TestBaseSetitem(base.BaseSetitemTests):
749771
@pytest.mark.xfail(
@@ -806,6 +828,18 @@ def test_invert(self, data, request):
806828

807829

808830
class TestBaseMethods(base.BaseMethodsTests):
831+
def test_hash_pandas_object_works(self, data, as_frame):
832+
pa_dtype = data.dtype.pyarrow_dtype
833+
warn_msg = "Pandas type inference with a sequence of `datetime.time`"
834+
warn = None
835+
if pa.types.is_time(pa_dtype):
836+
# TODO(#48964) This warning will be avoided by implementing
837+
# ArrowExtensionArray.hash_pandas_object
838+
warn = FutureWarning
839+
840+
with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False):
841+
super().test_hash_pandas_object_works(data, as_frame)
842+
809843
@pytest.mark.parametrize("periods", [1, -2])
810844
def test_diff(self, data, periods, request):
811845
pa_dtype = data.dtype.pyarrow_dtype

pandas/tests/groupby/test_apply.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from datetime import (
22
date,
33
datetime,
4+
time,
45
)
56
from io import StringIO
67

@@ -836,7 +837,16 @@ def test_apply_datetime_issue(group_column_dtlike):
836837
# is a datetime object and the column labels are different from
837838
# standard int values in range(len(num_columns))
838839

839-
df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]})
840+
warn = None
841+
warn_msg = (
842+
"Pandas type inference with a sequence of `datetime.time` "
843+
"objects is deprecated"
844+
)
845+
if isinstance(group_column_dtlike, time):
846+
warn = FutureWarning
847+
848+
with tm.assert_produces_warning(warn, match=warn_msg):
849+
df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]})
840850
result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42]))
841851

842852
expected = DataFrame(

pandas/tests/io/excel/test_readers.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -986,13 +986,17 @@ def test_reader_seconds(self, request, engine, read_ext):
986986
time(16, 37, 0, 900000),
987987
time(18, 20, 54),
988988
]
989-
}
989+
},
990+
dtype=object,
990991
)
991992

992-
actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1")
993+
warn_msg = "Pandas type inference with a sequence of `datetime.time` objects"
994+
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
995+
actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1")
993996
tm.assert_frame_equal(actual, expected)
994997

995-
actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1")
998+
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
999+
actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1")
9961000
tm.assert_frame_equal(actual, expected)
9971001

9981002
def test_read_excel_multiindex(self, request, read_ext):

0 commit comments

Comments
 (0)