Skip to content

Commit 1123455

Browse files
committed
POC: infer time objects to ArrowDtype[time]
1 parent a90fbc8 commit 1123455

File tree

16 files changed

+265
-30
lines changed

16 files changed

+265
-30
lines changed

pandas/_libs/lib.pyx

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ from typing import (
66
Literal,
77
_GenericAlias,
88
)
9+
import warnings
910

1011
cimport cython
1112
from cpython.datetime cimport (
@@ -99,6 +100,8 @@ cdef extern from "pd_parser.h":
99100

100101
PandasParser_IMPORT
101102

103+
from pandas._config import get_option
104+
102105
from pandas._libs cimport util
103106
from pandas._libs.util cimport (
104107
INT64_MAX,
@@ -1258,6 +1261,7 @@ cdef class Seen:
12581261
bint datetimetz_ # seen_datetimetz
12591262
bint period_ # seen_period
12601263
bint interval_ # seen_interval
1264+
bint time_
12611265

12621266
def __cinit__(self, bint coerce_numeric=False):
12631267
"""
@@ -1284,6 +1288,7 @@ cdef class Seen:
12841288
self.datetimetz_ = False
12851289
self.period_ = False
12861290
self.interval_ = False
1291+
self.time_ = False
12871292
self.coerce_numeric = coerce_numeric
12881293

12891294
cdef bint check_uint64_conflict(self) except -1:
@@ -2567,6 +2572,12 @@ def maybe_convert_objects(ndarray[object] objects,
25672572
else:
25682573
seen.object_ = True
25692574
break
2575+
elif PyTime_Check(val):
2576+
if convert_time:
2577+
seen.time_ = True
2578+
else:
2579+
seen.object_ = True
2580+
break
25702581
else:
25712582
seen.object_ = True
25722583
break
@@ -2631,6 +2642,35 @@ def maybe_convert_objects(ndarray[object] objects,
26312642

26322643
seen.object_ = True
26332644

2645+
elif seen.time_:
2646+
if is_time_array(objects):
2647+
opt = get_option("future.infer_time")
2648+
if opt is True:
2649+
import pyarrow as pa
2650+
2651+
from pandas.core.arrays.arrow import ArrowDtype
2652+
2653+
obj = pa.array(objects)
2654+
dtype = ArrowDtype(obj.type)
2655+
return dtype.construct_array_type()(obj)
2656+
elif opt is False:
2657+
# explicitly set to keep the old behavior and avoid the warning
2658+
pass
2659+
else:
2660+
from pandas.util._exceptions import find_stack_level
2661+
warnings.warn(
2662+
"Pandas type inference with a sequence of `datetime.time` "
2663+
"objects is deprecated. In a future version, this will give "
2664+
"time32[pyarrow] dtype, which will require pyarrow to be "
2665+
"installed. To opt in to the new behavior immediately set "
2666+
"`pd.set_option('future.infer_time', True)`. To keep the "
2667+
"old behavior pass `dtype=object`.",
2668+
FutureWarning,
2669+
stacklevel=find_stack_level(),
2670+
)
2671+
2672+
seen.object_ = True
2673+
26342674
elif seen.nat_:
26352675
if not seen.object_ and not seen.numeric_ and not seen.bool_:
26362676
# all NaT, None, or nan (at least one NaT)

pandas/core/arrays/arrow/array.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -522,9 +522,7 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
522522
pa_scalar = pa.scalar(other)
523523
result = pc_func(self._pa_array, pa_scalar)
524524
else:
525-
raise NotImplementedError(
526-
f"{op.__name__} not implemented for {type(other)}"
527-
)
525+
return NotImplemented
528526
return type(self)(result)
529527

530528
def _logical_method(self, other, op):

pandas/core/config_init.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -881,3 +881,14 @@ def register_converter_cb(key) -> None:
881881
styler_environment,
882882
validator=is_instance_factory([type(None), str]),
883883
)
884+
885+
886+
with cf.config_prefix("future"):
    # ``config_prefix`` already prepends "future." to every key registered
    # inside this block, so only the bare option name is passed here.  Passing
    # "future.infer_time" would register the option under the doubled key
    # "future.future.infer_time", which only resolves through pattern matching.
    cf.register_option(
        "infer_time",
        None,
        "Whether to infer sequence of datetime.time objects as pyarrow time "
        "dtype, which will be the default in pandas 3.0 "
        "(at which point this option will be deprecated).",
        # None means "not chosen yet"; True/False opt in to or out of the
        # new inference explicitly.
        validator=is_one_of_factory([True, False, None]),
    )

pandas/core/construction.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,13 @@
1414
cast,
1515
overload,
1616
)
17+
import warnings
1718

1819
import numpy as np
1920
from numpy import ma
2021

22+
from pandas._config import get_option
23+
2124
from pandas._libs import lib
2225
from pandas._libs.tslibs import (
2326
Period,
@@ -31,6 +34,7 @@
3134
DtypeObj,
3235
T,
3336
)
37+
from pandas.util._exceptions import find_stack_level
3438

3539
from pandas.core.dtypes.base import ExtensionDtype
3640
from pandas.core.dtypes.cast import (
@@ -293,6 +297,7 @@ def array(
293297
PeriodArray,
294298
TimedeltaArray,
295299
)
300+
from pandas.core.arrays.arrow import ArrowDtype
296301
from pandas.core.arrays.string_ import StringDtype
297302

298303
if lib.is_scalar(data):
@@ -360,6 +365,30 @@ def array(
360365
elif inferred_dtype == "boolean":
361366
return BooleanArray._from_sequence(data, copy=copy)
362367

368+
elif inferred_dtype == "time":
369+
opt = get_option("future.infer_time")
370+
371+
if opt is True:
372+
import pyarrow as pa
373+
374+
obj = pa.array(data)
375+
dtype = ArrowDtype(obj.type)
376+
return dtype.construct_array_type()(obj)
377+
elif opt is False:
378+
# explicitly set to keep the old behavior and avoid the warning
379+
pass
380+
else:
381+
warnings.warn(
382+
"Pandas type inference with a sequence of `datetime.time` "
383+
"objects is deprecated. In a future version, this will give "
384+
"time32[pyarrow] dtype, which will require pyarrow to be "
385+
"installed. To opt in to the new behavior immediately set "
386+
"`pd.set_option('future.infer_time', True)`. To keep the "
387+
"old behavior pass `dtype=object`.",
388+
FutureWarning,
389+
stacklevel=find_stack_level(),
390+
)
391+
363392
# Pandas overrides NumPy for
364393
# 1. datetime64[ns,us,ms,s]
365394
# 2. timedelta64[ns,us,ms,s]

pandas/core/indexes/accessors.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,9 @@ def _delegate_property_get(self, name: str): # type: ignore[override]
108108
else:
109109
index = self._parent.index
110110
# return the result as a Series
111-
result = Series(result, index=index, name=self.name).__finalize__(self._parent)
111+
result = Series(
112+
result, index=index, name=self.name, dtype=result.dtype
113+
).__finalize__(self._parent)
112114

113115
# setting this object will show a SettingWithCopyWarning/Error
114116
result._is_copy = (

pandas/tests/arithmetic/test_datetime64.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1166,7 +1166,10 @@ def test_dt64arr_add_sub_parr(
11661166
)
11671167
assert_invalid_addsub_type(dtarr, parr, msg)
11681168

1169-
def test_dt64arr_addsub_time_objects_raises(self, box_with_array, tz_naive_fixture):
1169+
@pytest.mark.parametrize("future", [True, False, None])
1170+
def test_dt64arr_addsub_time_objects_raises(
1171+
self, box_with_array, tz_naive_fixture, future
1172+
):
11701173
# https://github.com/pandas-dev/pandas/issues/10329
11711174

11721175
tz = tz_naive_fixture
@@ -1175,15 +1178,23 @@ def test_dt64arr_addsub_time_objects_raises(self, box_with_array, tz_naive_fixtu
11751178
obj2 = [time(i, i, i) for i in range(3)]
11761179

11771180
obj1 = tm.box_expected(obj1, box_with_array)
1178-
obj2 = tm.box_expected(obj2, box_with_array)
1179-
1180-
msg = "|".join(
1181-
[
1182-
"unsupported operand",
1183-
"cannot subtract DatetimeArray from ndarray",
1184-
]
1185-
)
11861181

1182+
msgs = [
1183+
"unsupported operand",
1184+
"cannot subtract DatetimeArray from ndarray",
1185+
]
1186+
warn_msg = "Pandas type inference with a sequence of `datetime.time` objects"
1187+
warn = None
1188+
if future is True:
1189+
msgs.append("cannot subtract DatetimeArray from ArrowExtensionArray")
1190+
elif future is None:
1191+
warn = FutureWarning
1192+
1193+
with pd.option_context("future.infer_time", future):
1194+
with tm.assert_produces_warning(warn, match=warn_msg):
1195+
obj2 = tm.box_expected(obj2, box_with_array)
1196+
1197+
msg = "|".join(msgs)
11871198
with warnings.catch_warnings(record=True):
11881199
# pandas.errors.PerformanceWarning: Non-vectorized DateOffset being
11891200
# applied to Series or DatetimeIndex

pandas/tests/dtypes/test_inference.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,6 +1039,56 @@ def test_maybe_convert_objects_ea(self, idx):
10391039
)
10401040
tm.assert_extension_array_equal(result, idx._data)
10411041

1042+
@pytest.mark.parametrize("future", [True, False, None])
def test_maybe_convert_objects_time(self, future):
    """
    Exercise ``datetime.time`` inference for each ``future.infer_time`` value.

    With the option set to True the expected result is a pyarrow-backed
    ``time64[us]`` array (the test is skipped when pyarrow is missing).
    Otherwise the object-dtype input is expected back unchanged, and when
    the option is None every inferring call must emit a FutureWarning.
    """
    now = Timestamp.now()
    values = np.array([now.time()], dtype=object)

    warn_msg = "Pandas type inference with a sequence of `datetime.time` objects"
    if future is True:
        pa = pytest.importorskip("pyarrow")
        arrow_dtype = pd.ArrowDtype(pa.time64("us"))
        expected = arrow_dtype.construct_array_type()._from_sequence(
            values, dtype=arrow_dtype
        )
        expected_warning = None
    else:
        expected = values
        expected_warning = FutureWarning if future is None else None

    # One inferring call per entry; they are executed in this order, each
    # inside its own warning check.
    builders = [
        lambda: lib.maybe_convert_objects(values, convert_time=True),
        lambda: Series(values),
        lambda: Series(list(values)),
        lambda: DataFrame(values),
        lambda: DataFrame(list(values)),
        lambda: Index(values),
        lambda: Index(list(values)),
        lambda: pd.array(values),
        lambda: pd.array(list(values)),
    ]
    results = []
    with pd.option_context("future.infer_time", future):
        for build in builders:
            with tm.assert_produces_warning(expected_warning, match=warn_msg):
                results.append(build())
    out, ser, ser2, df, df2, idx, idx2, arr, arr2 = results

    tm.assert_equal(out, expected)

    # Without the arrow conversion, pd.array is expected to hand back the
    # object ndarray wrapped in a PandasArray.
    expected_arr = expected if future else pd.core.arrays.PandasArray(expected)
    tm.assert_equal(arr, expected_arr)
    tm.assert_equal(arr2, expected_arr)

    expected_ser = Series(expected, dtype=expected.dtype)
    tm.assert_series_equal(ser, expected_ser)
    tm.assert_series_equal(ser2, expected_ser)

    expected_df = DataFrame(expected, dtype=expected.dtype)
    tm.assert_frame_equal(df, expected_df)
    tm.assert_frame_equal(df2, expected_df)

    expected_idx = Index(expected, dtype=expected.dtype)
    tm.assert_index_equal(idx, expected_idx)
    tm.assert_index_equal(idx2, expected_idx)
10421092

10431093
class TestTypeInference:
10441094
# Dummy class used for testing with Python objects

pandas/tests/extension/test_arrow.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,28 @@ class TestBaseReshaping(base.BaseReshapingTests):
712712
def test_transpose(self, data):
713713
super().test_transpose(data)
714714

715+
@pytest.mark.parametrize(
    "columns",
    [
        ["A", "B"],
        pd.MultiIndex.from_tuples(
            [("A", "a"), ("A", "b")], names=["outer", "inner"]
        ),
    ],
)
def test_stack(self, data, columns):
    """
    Run the base ``test_stack``, expecting a FutureWarning for time dtypes.

    For any other pyarrow dtype the base test must run without warnings.
    """
    # FIXME: need to avoid doing inference when calling frame._constructor
    # in _stack_multi_columns
    is_time_dtype = pa.types.is_time(data.dtype.pyarrow_dtype)
    expected_warning = FutureWarning if is_time_dtype else None

    with tm.assert_produces_warning(
        expected_warning,
        match="Pandas type inference with a sequence of `datetime.time` objects",
        check_stacklevel=False,
    ):
        super().test_stack(data, columns)
736+
715737

716738
class TestBaseSetitem(base.BaseSetitemTests):
717739
@pytest.mark.xfail(
@@ -774,6 +796,18 @@ def test_invert(self, data, request):
774796

775797

776798
class TestBaseMethods(base.BaseMethodsTests):
799+
def test_hash_pandas_object_works(self, data, as_frame):
    """
    Run the base hashing test, expecting a FutureWarning for time dtypes.

    For any other pyarrow dtype the base test must run without warnings.
    """
    # TODO(#48964) This warning will be avoided by implementing
    # ArrowExtensionArray.hash_pandas_object
    is_time_dtype = pa.types.is_time(data.dtype.pyarrow_dtype)
    expected_warning = FutureWarning if is_time_dtype else None

    with tm.assert_produces_warning(
        expected_warning,
        match="Pandas type inference with a sequence of `datetime.time`",
        check_stacklevel=False,
    ):
        super().test_hash_pandas_object_works(data, as_frame)
810+
777811
@pytest.mark.parametrize("periods", [1, -2])
778812
def test_diff(self, data, periods, request):
779813
pa_dtype = data.dtype.pyarrow_dtype

pandas/tests/groupby/test_apply.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from datetime import (
22
date,
33
datetime,
4+
time,
45
)
56
from io import StringIO
67

@@ -834,7 +835,16 @@ def test_apply_datetime_issue(group_column_dtlike):
834835
# is a datetime object and the column labels are different from
835836
# standard int values in range(len(num_columns))
836837

837-
df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]})
838+
warn = None
839+
warn_msg = (
840+
"Pandas type inference with a sequence of `datetime.time` "
841+
"objects is deprecated"
842+
)
843+
if isinstance(group_column_dtlike, time):
844+
warn = FutureWarning
845+
846+
with tm.assert_produces_warning(warn, match=warn_msg):
847+
df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]})
838848
result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42]))
839849

840850
expected = DataFrame(

pandas/tests/io/excel/test_readers.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -993,13 +993,17 @@ def test_reader_seconds(self, request, engine, read_ext):
993993
time(16, 37, 0, 900000),
994994
time(18, 20, 54),
995995
]
996-
}
996+
},
997+
dtype=object,
997998
)
998999

999-
actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1")
1000+
warn_msg = "Pandas type inference with a sequence of `datetime.time` objects"
1001+
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
1002+
actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1")
10001003
tm.assert_frame_equal(actual, expected)
10011004

1002-
actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1")
1005+
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
1006+
actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1")
10031007
tm.assert_frame_equal(actual, expected)
10041008

10051009
def test_read_excel_multiindex(self, request, read_ext):

0 commit comments

Comments
 (0)