Skip to content

Commit 44ee29c

Browse files
committed
POC: infer time objects to ArrowDtype[time]
1 parent 8cf4ab4 commit 44ee29c

File tree

16 files changed

+266
-28
lines changed

16 files changed

+266
-28
lines changed

pandas/_libs/lib.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def map_infer(
7070
convert: bool = ...,
7171
ignore_na: bool = ...,
7272
) -> np.ndarray: ...
73+
7374
@overload
7475
def maybe_convert_objects(
7576
objects: npt.NDArray[np.object_],

pandas/_libs/lib.pyx

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ from typing import (
66
Literal,
77
_GenericAlias,
88
)
9+
import warnings
910

1011
cimport cython
1112
from cpython.datetime cimport (
@@ -99,6 +100,8 @@ cdef extern from "pd_parser.h":
99100

100101
PandasParser_IMPORT
101102

103+
from pandas._config import get_option
104+
102105
from pandas._libs cimport util
103106
from pandas._libs.util cimport (
104107
INT64_MAX,
@@ -1258,6 +1261,7 @@ cdef class Seen:
12581261
bint datetimetz_ # seen_datetimetz
12591262
bint period_ # seen_period
12601263
bint interval_ # seen_interval
1264+
bint time_
12611265

12621266
def __cinit__(self, bint coerce_numeric=False):
12631267
"""
@@ -1284,6 +1288,7 @@ cdef class Seen:
12841288
self.datetimetz_ = False
12851289
self.period_ = False
12861290
self.interval_ = False
1291+
self.time_ = False
12871292
self.coerce_numeric = coerce_numeric
12881293

12891294
cdef bint check_uint64_conflict(self) except -1:
@@ -2567,6 +2572,12 @@ def maybe_convert_objects(ndarray[object] objects,
25672572
else:
25682573
seen.object_ = True
25692574
break
2575+
elif PyTime_Check(val):
2576+
if convert_time:
2577+
seen.time_ = True
2578+
else:
2579+
seen.object_ = True
2580+
break
25702581
else:
25712582
seen.object_ = True
25722583
break
@@ -2631,7 +2642,36 @@ def maybe_convert_objects(ndarray[object] objects,
26312642

26322643
seen.object_ = True
26332644

2634-
elif seen.nat_:
2645+
elif seen.time_:
2646+
if is_time_array(objects):
2647+
opt = get_option("future.infer_time")
2648+
if opt is True:
2649+
import pyarrow as pa
2650+
2651+
from pandas.core.arrays.arrow import ArrowDtype
2652+
2653+
obj = pa.array(objects)
2654+
dtype = ArrowDtype(obj.type)
2655+
return dtype.construct_array_type()(obj)
2656+
elif opt is False:
2657+
# explicitly set to keep the old behavior and avoid the warning
2658+
pass
2659+
else:
2660+
from pandas.util._exceptions import find_stack_level
2661+
warnings.warn(
2662+
"Pandas type inference with a sequence of `datetime.time` "
2663+
"objects is deprecated. In a future version, this will give "
2664+
"time32[pyarrow] dtype, which will require pyarrow to be "
2665+
"installed. To opt in to the new behavior immediately set "
2666+
"`pd.set_option('future.infer_time', True)`. To keep the "
2667+
"old behavior pass `dtype=object`.",
2668+
FutureWarning,
2669+
stacklevel=find_stack_level(),
2670+
)
2671+
2672+
seen.object_ = True
2673+
2674+
if seen.nat_:
26352675
if not seen.object_ and not seen.numeric_ and not seen.bool_:
26362676
# all NaT, None, or nan (at least one NaT)
26372677
# see GH#49340 for discussion of desired behavior

pandas/core/config_init.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -881,3 +881,14 @@ def register_converter_cb(key) -> None:
881881
styler_environment,
882882
validator=is_instance_factory([type(None), str]),
883883
)
884+
885+
886+
# Options in the "future" namespace let users opt in early to behavior that
# is planned to become the default in a later pandas release.
with cf.config_prefix("future"):
    # ``config_prefix`` already prepends "future." to every key registered
    # inside this block, so only the bare option name is given here.  Passing
    # "future.infer_time" would register "future.future.infer_time", which is
    # only reachable through the regex-fallback lookup in ``get_option``.
    #
    # Allowed values are True, False and None (the default).
    cf.register_option(
        "infer_time",
        None,
        "Whether to infer sequence of datetime.time objects as pyarrow time "
        "dtype, which will be the default in pandas 3.0 "
        "(at which point this option will be deprecated).",
        validator=is_one_of_factory([True, False, None]),
    )

pandas/core/construction.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,13 @@
1414
cast,
1515
overload,
1616
)
17+
import warnings
1718

1819
import numpy as np
1920
from numpy import ma
2021

22+
from pandas._config import get_option
23+
2124
from pandas._libs import lib
2225
from pandas._libs.tslibs import (
2326
Period,
@@ -31,6 +34,7 @@
3134
DtypeObj,
3235
T,
3336
)
37+
from pandas.util._exceptions import find_stack_level
3438

3539
from pandas.core.dtypes.base import ExtensionDtype
3640
from pandas.core.dtypes.cast import (
@@ -293,6 +297,7 @@ def array(
293297
PeriodArray,
294298
TimedeltaArray,
295299
)
300+
from pandas.core.arrays.arrow import ArrowDtype
296301
from pandas.core.arrays.string_ import StringDtype
297302

298303
if lib.is_scalar(data):
@@ -360,6 +365,30 @@ def array(
360365
elif inferred_dtype == "boolean":
361366
return BooleanArray._from_sequence(data, copy=copy)
362367

368+
elif inferred_dtype == "time":
369+
opt = get_option("future.infer_time")
370+
371+
if opt is True:
372+
import pyarrow as pa
373+
374+
obj = pa.array(data)
375+
dtype = ArrowDtype(obj.type)
376+
return dtype.construct_array_type()(obj)
377+
elif opt is False:
378+
# explicitly set to keep the old behavior and avoid the warning
379+
pass
380+
else:
381+
warnings.warn(
382+
"Pandas type inference with a sequence of `datetime.time` "
383+
"objects is deprecated. In a future version, this will give "
384+
"time32[pyarrow] dtype, which will require pyarrow to be "
385+
"installed. To opt in to the new behavior immediately set "
386+
"`pd.set_option('future.infer_time', True)`. To keep the "
387+
"old behavior pass `dtype=object`.",
388+
FutureWarning,
389+
stacklevel=find_stack_level(),
390+
)
391+
363392
# Pandas overrides NumPy for
364393
# 1. datetime64[ns,us,ms,s]
365394
# 2. timedelta64[ns,us,ms,s]

pandas/core/indexes/accessors.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,9 @@ def _delegate_property_get(self, name: str): # type: ignore[override]
108108
else:
109109
index = self._parent.index
110110
# return the result as a Series
111-
result = Series(result, index=index, name=self.name).__finalize__(self._parent)
111+
result = Series(
112+
result, index=index, name=self.name, dtype=result.dtype
113+
).__finalize__(self._parent)
112114

113115
# setting this object will show a SettingWithCopyWarning/Error
114116
result._is_copy = (

pandas/tests/arithmetic/test_datetime64.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1166,7 +1166,10 @@ def test_dt64arr_add_sub_parr(
11661166
)
11671167
assert_invalid_addsub_type(dtarr, parr, msg)
11681168

1169-
def test_dt64arr_addsub_time_objects_raises(self, box_with_array, tz_naive_fixture):
1169+
@pytest.mark.parametrize("future", [True, False, None])
1170+
def test_dt64arr_addsub_time_objects_raises(
1171+
self, box_with_array, tz_naive_fixture, future
1172+
):
11701173
# https://github.com/pandas-dev/pandas/issues/10329
11711174

11721175
tz = tz_naive_fixture
@@ -1175,15 +1178,23 @@ def test_dt64arr_addsub_time_objects_raises(self, box_with_array, tz_naive_fixtu
11751178
obj2 = [time(i, i, i) for i in range(3)]
11761179

11771180
obj1 = tm.box_expected(obj1, box_with_array)
1178-
obj2 = tm.box_expected(obj2, box_with_array)
1179-
1180-
msg = "|".join(
1181-
[
1182-
"unsupported operand",
1183-
"cannot subtract DatetimeArray from ndarray",
1184-
]
1185-
)
11861181

1182+
msgs = [
1183+
"unsupported operand",
1184+
"cannot subtract DatetimeArray from ndarray",
1185+
]
1186+
warn_msg = "Pandas type inference with a sequence of `datetime.time` objects"
1187+
warn = None
1188+
if future is True:
1189+
msgs.append("cannot subtract DatetimeArray from ArrowExtensionArray")
1190+
elif future is None:
1191+
warn = FutureWarning
1192+
1193+
with pd.option_context("future.infer_time", future):
1194+
with tm.assert_produces_warning(warn, match=warn_msg):
1195+
obj2 = tm.box_expected(obj2, box_with_array)
1196+
1197+
msg = "|".join(msgs)
11871198
with warnings.catch_warnings(record=True):
11881199
# pandas.errors.PerformanceWarning: Non-vectorized DateOffset being
11891200
# applied to Series or DatetimeIndex

pandas/tests/dtypes/test_inference.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,6 +1037,56 @@ def test_maybe_convert_objects_ea(self, idx):
10371037
)
10381038
tm.assert_extension_array_equal(result, idx._data)
10391039

1040+
@pytest.mark.parametrize("future", [True, False, None])
def test_maybe_convert_objects_time(self, future):
    """
    Check inference of ``datetime.time`` objects under ``future.infer_time``.

    With the option set to True the results are compared against a
    pyarrow-backed ``time64[us]`` array.  With False or None the original
    object array is expected back; None additionally expects a
    FutureWarning from every inference entry point exercised here.
    """
    time_objs = np.array([Timestamp.now().time()], dtype=object)
    warn_msg = "Pandas type inference with a sequence of `datetime.time` objects"

    if future is True:
        pa = pytest.importorskip("pyarrow")
        arrow_dtype = pd.ArrowDtype(pa.time64("us"))
        arrow_cls = arrow_dtype.construct_array_type()
        expected = arrow_cls._from_sequence(time_objs, dtype=arrow_dtype)
        expected_warning = None
    else:
        expected = time_objs
        expected_warning = FutureWarning if future is None else None

    # Each entry point is exercised in the same order as before; the
    # constructors are tried with both ndarray and list input.
    builders = {
        "out": lambda: lib.maybe_convert_objects(time_objs, convert_time=True),
        "ser": lambda: Series(time_objs),
        "ser2": lambda: Series(list(time_objs)),
        "df": lambda: DataFrame(time_objs),
        "df2": lambda: DataFrame(list(time_objs)),
        "idx": lambda: Index(time_objs),
        "idx2": lambda: Index(list(time_objs)),
        "arr": lambda: pd.array(time_objs),
        "arr2": lambda: pd.array(list(time_objs)),
    }
    results = {}
    with pd.option_context("future.infer_time", future):
        for label, build in builders.items():
            with tm.assert_produces_warning(expected_warning, match=warn_msg):
                results[label] = build()

    tm.assert_equal(results["out"], expected)

    # pd.array wraps the object ndarray in a PandasArray on the legacy path.
    for label in ("arr", "arr2"):
        if future:
            tm.assert_equal(results[label], expected)
        else:
            tm.assert_equal(results[label], pd.core.arrays.PandasArray(expected))

    # Passing the dtype explicitly builds the expected containers without
    # going through inference again.
    for label in ("ser", "ser2"):
        tm.assert_series_equal(results[label], Series(expected, dtype=expected.dtype))
    for label in ("df", "df2"):
        tm.assert_frame_equal(results[label], DataFrame(expected, dtype=expected.dtype))
    for label in ("idx", "idx2"):
        tm.assert_index_equal(results[label], Index(expected, dtype=expected.dtype))
10401090

10411091
class TestTypeInference:
10421092
# Dummy class used for testing with Python objects

pandas/tests/extension/test_arrow.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -713,6 +713,28 @@ class TestBaseReshaping(base.BaseReshapingTests):
713713
def test_transpose(self, data):
714714
super().test_transpose(data)
715715

716+
@pytest.mark.parametrize(
    "columns",
    [
        ["A", "B"],
        pd.MultiIndex.from_tuples(
            [("A", "a"), ("A", "b")], names=["outer", "inner"]
        ),
    ],
)
def test_stack(self, data, columns):
    """Run the inherited stack test, expecting a FutureWarning for time dtypes."""
    expected_match = "Pandas type inference with a sequence of `datetime.time` objects"

    # FIXME: need to avoid doing inference when calling frame._constructor
    #  in _stack_multi_columns
    is_time_dtype = pa.types.is_time(data.dtype.pyarrow_dtype)
    expected_warning = FutureWarning if is_time_dtype else None

    with tm.assert_produces_warning(
        expected_warning, match=expected_match, check_stacklevel=False
    ):
        super().test_stack(data, columns)
737+
716738

717739
class TestBaseSetitem(base.BaseSetitemTests):
718740
@pytest.mark.xfail(
@@ -775,6 +797,18 @@ def test_invert(self, data, request):
775797

776798

777799
class TestBaseMethods(base.BaseMethodsTests):
800+
def test_hash_pandas_object_works(self, data, as_frame):
    """Run the inherited hashing test, expecting a FutureWarning for time dtypes."""
    expected_match = "Pandas type inference with a sequence of `datetime.time`"

    # TODO(#48964) This warning will be avoided by implementing
    #  ArrowExtensionArray.hash_pandas_object
    is_time_dtype = pa.types.is_time(data.dtype.pyarrow_dtype)
    expected_warning = FutureWarning if is_time_dtype else None

    with tm.assert_produces_warning(
        expected_warning, match=expected_match, check_stacklevel=False
    ):
        super().test_hash_pandas_object_works(data, as_frame)
778812
@pytest.mark.parametrize("periods", [1, -2])
779813
def test_diff(self, data, periods, request):
780814
pa_dtype = data.dtype.pyarrow_dtype

pandas/tests/groupby/test_apply.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from datetime import (
22
date,
33
datetime,
4+
time,
45
)
56
from io import StringIO
67

@@ -836,7 +837,16 @@ def test_apply_datetime_issue(group_column_dtlike):
836837
# is a datetime object and the column labels are different from
837838
# standard int values in range(len(num_columns))
838839

839-
df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]})
840+
warn = None
841+
warn_msg = (
842+
"Pandas type inference with a sequence of `datetime.time` "
843+
"objects is deprecated"
844+
)
845+
if isinstance(group_column_dtlike, time):
846+
warn = FutureWarning
847+
848+
with tm.assert_produces_warning(warn, match=warn_msg):
849+
df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]})
840850
result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42]))
841851

842852
expected = DataFrame(

pandas/tests/io/excel/test_readers.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -993,13 +993,17 @@ def test_reader_seconds(self, request, engine, read_ext):
993993
time(16, 37, 0, 900000),
994994
time(18, 20, 54),
995995
]
996-
}
996+
},
997+
dtype=object,
997998
)
998999

999-
actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1")
1000+
warn_msg = "Pandas type inference with a sequence of `datetime.time` objects"
1001+
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
1002+
actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1")
10001003
tm.assert_frame_equal(actual, expected)
10011004

1002-
actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1")
1005+
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
1006+
actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1")
10031007
tm.assert_frame_equal(actual, expected)
10041008

10051009
def test_read_excel_multiindex(self, request, read_ext):

0 commit comments

Comments
 (0)