Skip to content

Commit fc95c06

Browse files
Merge remote-tracking branch 'upstream/master' into arrow-string-array-dtype
2 parents 51f1b1d + 3f67dc3 commit fc95c06

30 files changed

+213
-162
lines changed

doc/source/whatsnew/v1.2.5.rst

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,10 @@ Fixed regressions
1818
- Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`)
1919
- Regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`)
2020
- Regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace are given as a NumPy float array (:issue:`40371`)
21+
- Regression in :class:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`)
2122

2223
.. ---------------------------------------------------------------------------
2324
24-
.. _whatsnew_125.deprecations:
25-
26-
Deprecations
27-
~~~~~~~~~~~~
28-
29-
- Deprecated passing lists as ``key`` to :meth:`DataFrame.xs` and :meth:`Series.xs` (:issue:`41760`)
3025
3126
.. _whatsnew_125.bug_fixes:
3227

doc/source/whatsnew/v1.3.0.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,7 @@ Deprecations
740740
- Deprecated passing arguments as positional (except for ``"codes"``) in :meth:`MultiIndex.set_codes` (:issue:`41485`)
741741
- Deprecated passing arguments as positional in :meth:`Index.set_names` and :meth:`MultiIndex.set_names` (except for ``names``) (:issue:`41485`)
742742
- Deprecated passing arguments (apart from ``cond`` and ``other``) as positional in :meth:`DataFrame.mask` and :meth:`Series.mask` (:issue:`41485`)
743+
- Deprecated passing arguments as positional in :meth:`Resampler.interpolate` (other than ``"method"``) (:issue:`41485`)
743744
- Deprecated passing arguments as positional in :meth:`DataFrame.clip` and :meth:`Series.clip` (other than ``"upper"`` and ``"lower"``) (:issue:`41485`)
744745
- Deprecated special treatment of lists with first element a Categorical in the :class:`DataFrame` constructor; pass as ``pd.DataFrame({col: categorical, ...})`` instead (:issue:`38845`)
745746
- Deprecated behavior of :class:`DataFrame` constructor when a ``dtype`` is passed and the data cannot be cast to that dtype. In a future version, this will raise instead of being silently ignored (:issue:`24435`)
@@ -761,6 +762,7 @@ Deprecations
761762
- Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`)
762763
- Deprecated passing arguments as positional in :meth:`DataFrame.where` and :meth:`Series.where` (other than ``"cond"`` and ``"other"``) (:issue:`41485`)
763764
- Deprecated passing arguments as positional (other than ``filepath_or_buffer``) in :func:`read_csv` (:issue:`41485`)
765+
- Deprecated passing lists as ``key`` to :meth:`DataFrame.xs` and :meth:`Series.xs` (:issue:`41760`)
764766
- Deprecated passing arguments as positional in :meth:`DataFrame.drop` (other than ``"labels"``) and :meth:`Series.drop` (:issue:`41485`)
765767
- Deprecated passing arguments as positional (other than ``filepath_or_buffer``) in :func:`read_table` (:issue:`41485`)
766768

@@ -946,6 +948,7 @@ Conversion
946948
- Bug in :func:`qcut` raising an error when taking ``Float64Dtype`` as input (:issue:`40730`)
947949
- Bug in :class:`DataFrame` and :class:`Series` construction with ``datetime64[ns]`` data and ``dtype=object`` resulting in ``datetime`` objects instead of :class:`Timestamp` objects (:issue:`41599`)
948950
- Bug in :class:`DataFrame` and :class:`Series` construction with ``timedelta64[ns]`` data and ``dtype=object`` resulting in ``np.timedelta64`` objects instead of :class:`Timedelta` objects (:issue:`41599`)
951+
- Bug in :class:`DataFrame` construction when given a two-dimensional object-dtype ``np.ndarray`` of :class:`Period` or :class:`Interval` objects failing to cast to :class:`PeriodDtype` or :class:`IntervalDtype`, respectively (:issue:`41812`)
949952

950953
Strings
951954
^^^^^^^

pandas/_libs/intervaltree.pxi.in

Lines changed: 39 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@ cdef class IntervalTree(IntervalMixin):
3131
we are emulating the IndexEngine interface
3232
"""
3333
cdef readonly:
34-
object left, right, root, dtype
34+
ndarray left, right
35+
IntervalNode root
36+
object dtype
3537
str closed
3638
object _is_overlapping, _left_sorter, _right_sorter
3739

@@ -203,6 +205,41 @@ cdef sort_values_and_indices(all_values, all_indices, subset):
203205
# Nodes
204206
# ----------------------------------------------------------------------
205207

208+
@cython.internal
209+
cdef class IntervalNode:
210+
cdef readonly:
211+
int64_t n_elements, n_center, leaf_size
212+
bint is_leaf_node
213+
214+
def __repr__(self) -> str:
215+
if self.is_leaf_node:
216+
return (
217+
f"<{type(self).__name__}: {self.n_elements} elements (terminal)>"
218+
)
219+
else:
220+
n_left = self.left_node.n_elements
221+
n_right = self.right_node.n_elements
222+
n_center = self.n_elements - n_left - n_right
223+
return (
224+
f"<{type(self).__name__}: "
225+
f"pivot {self.pivot}, {self.n_elements} elements "
226+
f"({n_left} left, {n_right} right, {n_center} overlapping)>"
227+
)
228+
229+
def counts(self):
230+
"""
231+
Inspect counts on this node
232+
useful for debugging purposes
233+
"""
234+
if self.is_leaf_node:
235+
return self.n_elements
236+
else:
237+
m = len(self.center_left_values)
238+
l = self.left_node.counts()
239+
r = self.right_node.counts()
240+
return (m, (l, r))
241+
242+
206243
# we need specialized nodes and leaves to optimize for different dtype and
207244
# closed values
208245

@@ -240,7 +277,7 @@ NODE_CLASSES = {}
240277

241278

242279
@cython.internal
243-
cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode:
280+
cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode(IntervalNode):
244281
"""Non-terminal node for an IntervalTree
245282

246283
Categorizes intervals by those that fall to the left, those that fall to
@@ -252,8 +289,6 @@ cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode:
252289
int64_t[:] center_left_indices, center_right_indices, indices
253290
{{dtype}}_t min_left, max_right
254291
{{dtype}}_t pivot
255-
int64_t n_elements, n_center, leaf_size
256-
bint is_leaf_node
257292

258293
def __init__(self,
259294
ndarray[{{dtype}}_t, ndim=1] left,
@@ -381,31 +416,6 @@ cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode:
381416
else:
382417
result.extend(self.center_left_indices)
383418

384-
def __repr__(self) -> str:
385-
if self.is_leaf_node:
386-
return ('<{{dtype_title}}Closed{{closed_title}}IntervalNode: '
387-
'%s elements (terminal)>' % self.n_elements)
388-
else:
389-
n_left = self.left_node.n_elements
390-
n_right = self.right_node.n_elements
391-
n_center = self.n_elements - n_left - n_right
392-
return ('<{{dtype_title}}Closed{{closed_title}}IntervalNode: '
393-
'pivot %s, %s elements (%s left, %s right, %s '
394-
'overlapping)>' % (self.pivot, self.n_elements,
395-
n_left, n_right, n_center))
396-
397-
def counts(self):
398-
"""
399-
Inspect counts on this node
400-
useful for debugging purposes
401-
"""
402-
if self.is_leaf_node:
403-
return self.n_elements
404-
else:
405-
m = len(self.center_left_values)
406-
l = self.left_node.counts()
407-
r = self.right_node.counts()
408-
return (m, (l, r))
409419

410420
NODE_CLASSES['{{dtype}}',
411421
'{{closed}}'] = {{dtype_title}}Closed{{closed_title}}IntervalNode

pandas/_libs/lib.pyi

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def maybe_convert_objects(
7373
convert_datetime: Literal[False] = ...,
7474
convert_timedelta: bool = ...,
7575
convert_period: Literal[False] = ...,
76+
convert_interval: Literal[False] = ...,
7677
convert_to_nullable_integer: Literal[False] = ...,
7778
dtype_if_all_nat: DtypeObj | None = ...,
7879
) -> np.ndarray: ...
@@ -86,6 +87,7 @@ def maybe_convert_objects(
8687
convert_datetime: bool = ...,
8788
convert_timedelta: bool = ...,
8889
convert_period: bool = ...,
90+
convert_interval: bool = ...,
8991
convert_to_nullable_integer: Literal[True] = ...,
9092
dtype_if_all_nat: DtypeObj | None = ...,
9193
) -> ArrayLike: ...
@@ -99,6 +101,7 @@ def maybe_convert_objects(
99101
convert_datetime: Literal[True] = ...,
100102
convert_timedelta: bool = ...,
101103
convert_period: bool = ...,
104+
convert_interval: bool = ...,
102105
convert_to_nullable_integer: bool = ...,
103106
dtype_if_all_nat: DtypeObj | None = ...,
104107
) -> ArrayLike: ...
@@ -112,6 +115,7 @@ def maybe_convert_objects(
112115
convert_datetime: bool = ...,
113116
convert_timedelta: bool = ...,
114117
convert_period: Literal[True] = ...,
118+
convert_interval: bool = ...,
115119
convert_to_nullable_integer: bool = ...,
116120
dtype_if_all_nat: DtypeObj | None = ...,
117121
) -> ArrayLike: ...
@@ -125,6 +129,7 @@ def maybe_convert_objects(
125129
convert_datetime: bool = ...,
126130
convert_timedelta: bool = ...,
127131
convert_period: bool = ...,
132+
convert_interval: bool = ...,
128133
convert_to_nullable_integer: bool = ...,
129134
dtype_if_all_nat: DtypeObj | None = ...,
130135
) -> ArrayLike: ...

pandas/_libs/lib.pyx

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1573,6 +1573,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
15731573
bint seen_timedelta = False, seen_date = False, seen_datetime = False
15741574
bint seen_tz_aware = False, seen_tz_naive = False
15751575
bint seen_nat = False, seen_str = False
1576+
bint seen_period = False, seen_interval = False
15761577
list objs = []
15771578
object v
15781579

@@ -1610,9 +1611,25 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
16101611
elif is_timedelta(v):
16111612
# timedelta, or timedelta64
16121613
seen_timedelta = True
1614+
elif is_period_object(v):
1615+
seen_period = True
1616+
break
1617+
elif is_interval(v):
1618+
seen_interval = True
1619+
break
16131620
else:
16141621
return "mixed", seen_str
16151622

1623+
if seen_period:
1624+
if is_period_array(arr):
1625+
return "period", seen_str
1626+
return "mixed", seen_str
1627+
1628+
if seen_interval:
1629+
if is_interval_array(arr):
1630+
return "interval", seen_str
1631+
return "mixed", seen_str
1632+
16161633
if seen_date and not (seen_datetime or seen_timedelta):
16171634
return "date", seen_str
16181635
elif seen_datetime and not seen_timedelta:

pandas/core/arrays/datetimelike.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1677,11 +1677,6 @@ class TimelikeOps(DatetimeLikeArrayMixin):
16771677
Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex.
16781678
"""
16791679

1680-
def copy(self: TimelikeOpsT) -> TimelikeOpsT:
1681-
result = super().copy()
1682-
result._freq = self._freq
1683-
return result
1684-
16851680
def _round(self, freq, mode, ambiguous, nonexistent):
16861681
# round the local times
16871682
if is_datetime64tz_dtype(self.dtype):

pandas/core/construction.py

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
)
3333
from pandas.core.dtypes.cast import (
3434
construct_1d_arraylike_from_scalar,
35-
construct_1d_ndarray_preserving_na,
3635
construct_1d_object_array_from_listlike,
3736
maybe_cast_to_datetime,
3837
maybe_cast_to_integer_array,
@@ -48,7 +47,6 @@
4847
is_integer_dtype,
4948
is_list_like,
5049
is_object_dtype,
51-
is_string_dtype,
5250
is_timedelta64_ns_dtype,
5351
)
5452
from pandas.core.dtypes.dtypes import DatetimeTZDtype
@@ -570,26 +568,18 @@ def sanitize_array(
570568
if dtype is not None or len(data) == 0:
571569
subarr = _try_cast(data, dtype, copy, raise_cast_failure)
572570
else:
573-
# TODO: copy?
574571
subarr = maybe_convert_platform(data)
575572
if subarr.dtype == object:
576573
subarr = cast(np.ndarray, subarr)
577574
subarr = maybe_infer_to_datetimelike(subarr)
578575

579576
subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
580577

581-
if not (
582-
isinstance(subarr.dtype, ExtensionDtype) or isinstance(dtype, ExtensionDtype)
583-
):
578+
if isinstance(subarr, np.ndarray):
579+
# at this point we should have dtype be None or subarr.dtype == dtype
580+
dtype = cast(np.dtype, dtype)
584581
subarr = _sanitize_str_dtypes(subarr, data, dtype, copy)
585582

586-
is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype(dtype)
587-
if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype:
588-
inferred = lib.infer_dtype(subarr, skipna=False)
589-
if inferred in {"interval", "period"}:
590-
subarr = array(subarr)
591-
subarr = extract_array(subarr, extract_numpy=True)
592-
593583
return subarr
594584

595585

@@ -748,6 +738,10 @@ def _try_cast(
748738
return subarr
749739
return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)
750740

741+
elif dtype.kind == "U":
742+
# TODO: test cases with arr.dtype.kind in ["m", "M"]
743+
return lib.ensure_string_array(arr, convert_na_value=False, copy=copy)
744+
751745
elif dtype.kind in ["m", "M"]:
752746
return maybe_cast_to_datetime(arr, dtype)
753747

@@ -757,16 +751,12 @@ def _try_cast(
757751
if is_integer_dtype(dtype):
758752
# this will raise if we have e.g. floats
759753

760-
maybe_cast_to_integer_array(arr, dtype)
761-
subarr = arr
754+
subarr = maybe_cast_to_integer_array(arr, dtype)
762755
else:
763-
subarr = arr
764-
765-
if not isinstance(subarr, ABCExtensionArray):
766756
# 4 tests fail if we move this to a try/except/else; see
767757
# test_constructor_compound_dtypes, test_constructor_cast_failure
768758
# test_constructor_dict_cast2, test_loc_setitem_dtype
769-
subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy)
759+
subarr = np.array(arr, dtype=dtype, copy=copy)
770760

771761
except (ValueError, TypeError):
772762
if raise_cast_failure:

0 commit comments

Comments
 (0)