Skip to content

Commit ac3cc84

Browse files
committed
Merge branch 'factorize_na_v2' of https://github.com/rhshadrach/pandas into add_numeric_only_gb
2 parents 58e9ddc + 557dfd5 commit ac3cc84

31 files changed

+312
-96
lines changed

doc/source/whatsnew/v1.5.0.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,7 @@ Timedelta
481481

482482
Time Zones
483483
^^^^^^^^^^
484-
-
484+
- Bug in :class:`Timestamp` constructor raising when passed a ``ZoneInfo`` tzinfo object (:issue:`46425`)
485485
-
486486

487487
Numeric
@@ -569,6 +569,8 @@ I/O
569569
- Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`)
570570
- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
571571
- Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
572+
- :meth:`DataFrame.to_html` now excludes the ``border`` attribute from ``<table>`` elements when the ``border`` keyword is set to ``False``.
573+
-
572574

573575
Period
574576
^^^^^^
@@ -599,7 +601,7 @@ Groupby/resample/rolling
599601
- Bug in :meth:`GroupBy.cummax` with ``int64`` dtype with leading value being the smallest possible int64 (:issue:`46382`)
600602
- Bug in :meth:`GroupBy.max` with empty groups and ``uint64`` dtype incorrectly raising ``RuntimeError`` (:issue:`46408`)
601603
- Bug in :meth:`.GroupBy.apply` where it would fail when ``func`` was a string and args or kwargs were supplied (:issue:`46479`)
602-
-
604+
- Bug in :meth:`.Rolling.var` where it would segfault when calculating weighted variance if the window size was larger than the data size (:issue:`46760`)
603605

604606
Reshaping
605607
^^^^^^^^^

pandas/_libs/algos_common_helper.pxi.in

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ def ensure_{{name}}(object arr, copy=True):
6565
if (<ndarray>arr).descr.type_num == NPY_{{c_type}}:
6666
return arr
6767
else:
68-
return arr.astype(np.{{dtype}}, copy=copy)
68+
# equiv: arr.astype(np.{{dtype}}, copy=copy)
69+
return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_{{c_type}})
6970
else:
7071
return np.array(arr, dtype=np.{{dtype}})
7172

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,7 @@ cdef class {{name}}HashTable(HashTable):
658658
return_inverse=return_inverse)
659659

660660
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
661-
object na_value=None, object mask=None):
661+
object na_value=None, object mask=None, ignore_na=True):
662662
"""
663663
Calculate unique values and labels (no sorting!)
664664

@@ -690,7 +690,7 @@ cdef class {{name}}HashTable(HashTable):
690690
"""
691691
uniques_vector = {{name}}Vector()
692692
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
693-
na_value=na_value, ignore_na=True, mask=mask,
693+
na_value=na_value, ignore_na=ignore_na, mask=mask,
694694
return_inverse=True)
695695

696696
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -1037,7 +1037,7 @@ cdef class StringHashTable(HashTable):
10371037
return_inverse=return_inverse)
10381038

10391039
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1040-
object na_value=None, object mask=None):
1040+
object na_value=None, object mask=None, ignore_na=True):
10411041
"""
10421042
Calculate unique values and labels (no sorting!)
10431043

@@ -1067,7 +1067,7 @@ cdef class StringHashTable(HashTable):
10671067
"""
10681068
uniques_vector = ObjectVector()
10691069
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
1070-
na_value=na_value, ignore_na=True,
1070+
na_value=na_value, ignore_na=ignore_na,
10711071
return_inverse=True)
10721072

10731073
def get_labels(self, ndarray[object] values, ObjectVector uniques,
@@ -1290,7 +1290,7 @@ cdef class PyObjectHashTable(HashTable):
12901290
return_inverse=return_inverse)
12911291

12921292
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1293-
object na_value=None, object mask=None):
1293+
object na_value=None, object mask=None, ignore_na=True):
12941294
"""
12951295
Calculate unique values and labels (no sorting!)
12961296

@@ -1320,7 +1320,7 @@ cdef class PyObjectHashTable(HashTable):
13201320
"""
13211321
uniques_vector = ObjectVector()
13221322
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
1323-
na_value=na_value, ignore_na=True,
1323+
na_value=na_value, ignore_na=ignore_na,
13241324
return_inverse=True)
13251325

13261326
def get_labels(self, ndarray[object] values, ObjectVector uniques,

pandas/_libs/tslib.pyx

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ cpdef array_to_datetime(
424424
"""
425425
cdef:
426426
Py_ssize_t i, n = len(values)
427-
object val, py_dt, tz, tz_out = None
427+
object val, tz
428428
ndarray[int64_t] iresult
429429
ndarray[object] oresult
430430
npy_datetimestruct dts
@@ -443,6 +443,8 @@ cpdef array_to_datetime(
443443
float offset_seconds, tz_offset
444444
set out_tzoffset_vals = set()
445445
bint string_to_dts_failed
446+
datetime py_dt
447+
tzinfo tz_out = None
446448

447449
# specify error conditions
448450
assert is_raise or is_ignore or is_coerce
@@ -647,6 +649,8 @@ cpdef array_to_datetime(
647649
return result, tz_out
648650

649651

652+
@cython.wraparound(False)
653+
@cython.boundscheck(False)
650654
cdef ndarray[object] ignore_errors_out_of_bounds_fallback(ndarray[object] values):
651655
"""
652656
Fallback for array_to_datetime if an OutOfBoundsDatetime is raised

pandas/_libs/tslibs/conversion.pyx

Lines changed: 33 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ from pandas._libs.tslibs.timezones cimport (
5656
is_fixed_offset,
5757
is_tzlocal,
5858
is_utc,
59+
is_zoneinfo,
5960
maybe_get_tz,
6061
tz_compare,
6162
utc_pytz as UTC,
@@ -532,7 +533,7 @@ cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts,
532533
# see PEP 495 https://www.python.org/dev/peps/pep-0495/#the-fold-attribute
533534
if is_utc(tz):
534535
pass
535-
elif is_tzlocal(tz):
536+
elif is_tzlocal(tz) or is_zoneinfo(tz):
536537
localize_tzinfo_api(obj.value, tz, &obj.fold)
537538
else:
538539
trans, deltas, typ = get_dst_info(tz)
@@ -581,55 +582,62 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit,
581582
"""
582583
cdef:
583584
npy_datetimestruct dts
584-
int out_local = 0, out_tzoffset = 0
585-
bint do_parse_datetime_string = False
585+
int out_local = 0, out_tzoffset = 0, string_to_dts_failed
586+
datetime dt
587+
int64_t ival
586588

587589
if len(ts) == 0 or ts in nat_strings:
588590
ts = NaT
591+
obj = _TSObject()
592+
obj.value = NPY_NAT
593+
obj.tzinfo = tz
594+
return obj
589595
elif ts == 'now':
590596
# Issue 9000, we short-circuit rather than going
591597
# into np_datetime_strings which returns utc
592-
ts = datetime.now(tz)
598+
dt = datetime.now(tz)
593599
elif ts == 'today':
594600
# Issue 9000, we short-circuit rather than going
595601
# into np_datetime_strings which returns a normalized datetime
596-
ts = datetime.now(tz)
602+
dt = datetime.now(tz)
597603
# equiv: datetime.today().replace(tzinfo=tz)
598604
else:
599605
string_to_dts_failed = _string_to_dts(
600606
ts, &dts, &out_local,
601607
&out_tzoffset, False
602608
)
603-
try:
604-
if not string_to_dts_failed:
609+
if not string_to_dts_failed:
610+
try:
605611
check_dts_bounds(&dts)
606612
if out_local == 1:
607613
return _create_tsobject_tz_using_offset(dts,
608614
out_tzoffset, tz)
609615
else:
610-
ts = dtstruct_to_dt64(&dts)
616+
ival = dtstruct_to_dt64(&dts)
611617
if tz is not None:
612618
# shift for _localize_tso
613-
ts = tz_localize_to_utc_single(ts, tz,
614-
ambiguous="raise")
619+
ival = tz_localize_to_utc_single(ival, tz,
620+
ambiguous="raise")
615621

616-
except OutOfBoundsDatetime:
617-
# GH#19382 for just-barely-OutOfBounds falling back to dateutil
618-
# parser will return incorrect result because it will ignore
619-
# nanoseconds
620-
raise
622+
return convert_to_tsobject(ival, tz, None, False, False)
621623

622-
except ValueError:
623-
do_parse_datetime_string = True
624+
except OutOfBoundsDatetime:
625+
# GH#19382 for just-barely-OutOfBounds falling back to dateutil
626+
# parser will return incorrect result because it will ignore
627+
# nanoseconds
628+
raise
624629

625-
if string_to_dts_failed or do_parse_datetime_string:
626-
try:
627-
ts = parse_datetime_string(ts, dayfirst=dayfirst,
628-
yearfirst=yearfirst)
629-
except (ValueError, OverflowError):
630-
raise ValueError("could not convert string to Timestamp")
630+
except ValueError:
631+
# Fall through to parse_datetime_string
632+
pass
633+
634+
try:
635+
dt = parse_datetime_string(ts, dayfirst=dayfirst,
636+
yearfirst=yearfirst)
637+
except (ValueError, OverflowError):
638+
raise ValueError("could not convert string to Timestamp")
631639

632-
return convert_to_tsobject(ts, tz, unit, dayfirst, yearfirst)
640+
return convert_datetime_to_tsobject(dt, tz)
633641

634642

635643
cdef inline check_overflows(_TSObject obj):
@@ -688,12 +696,8 @@ cdef inline void _localize_tso(_TSObject obj, tzinfo tz):
688696
Sets obj.tzinfo inplace, alters obj.dts inplace.
689697
"""
690698
cdef:
691-
ndarray[int64_t] trans
692-
int64_t[::1] deltas
693699
int64_t local_val
694-
int64_t* tdata
695-
Py_ssize_t pos, ntrans, outpos = -1
696-
str typ
700+
Py_ssize_t outpos = -1
697701

698702
assert obj.tzinfo is None
699703

pandas/_libs/tslibs/timestamps.pyx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,7 @@ cdef class _Timestamp(ABCTimestamp):
397397
elif is_datetime64_object(other):
398398
return type(self)(other) - self
399399
return NotImplemented
400+
400401
# -----------------------------------------------------------------
401402

402403
cdef int64_t _maybe_convert_value_to_local(self):

pandas/_libs/tslibs/timezones.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ cdef tzinfo utc_pytz
99

1010
cpdef bint is_utc(tzinfo tz)
1111
cdef bint is_tzlocal(tzinfo tz)
12+
cdef bint is_zoneinfo(tzinfo tz)
1213

1314
cdef bint treat_tz_as_pytz(tzinfo tz)
1415

pandas/_libs/tslibs/timezones.pyx

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,14 @@ from datetime import (
33
timezone,
44
)
55

6+
try:
7+
# py39+
8+
import zoneinfo
9+
from zoneinfo import ZoneInfo
10+
except ImportError:
11+
zoneinfo = None
12+
ZoneInfo = None
13+
614
from cpython.datetime cimport (
715
datetime,
816
timedelta,
@@ -42,18 +50,43 @@ cdef tzinfo utc_stdlib = timezone.utc
4250
cdef tzinfo utc_pytz = UTC
4351
cdef tzinfo utc_dateutil_str = dateutil_gettz("UTC") # NB: *not* the same as tzutc()
4452

53+
cdef tzinfo utc_zoneinfo = None
54+
4555

4656
# ----------------------------------------------------------------------
4757

58+
cdef inline bint is_utc_zoneinfo(tzinfo tz):
59+
# Workaround for cases with missing tzdata
60+
# https://github.com/pandas-dev/pandas/pull/46425#discussion_r830633025
61+
if tz is None or zoneinfo is None:
62+
return False
63+
64+
global utc_zoneinfo
65+
if utc_zoneinfo is None:
66+
try:
67+
utc_zoneinfo = ZoneInfo("UTC")
68+
except zoneinfo.ZoneInfoNotFoundError:
69+
return False
70+
71+
return tz is utc_zoneinfo
72+
73+
4874
cpdef inline bint is_utc(tzinfo tz):
4975
return (
5076
tz is utc_pytz
5177
or tz is utc_stdlib
5278
or isinstance(tz, _dateutil_tzutc)
5379
or tz is utc_dateutil_str
80+
or is_utc_zoneinfo(tz)
5481
)
5582

5683

84+
cdef inline bint is_zoneinfo(tzinfo tz):
85+
if ZoneInfo is None:
86+
return False
87+
return isinstance(tz, ZoneInfo)
88+
89+
5790
cdef inline bint is_tzlocal(tzinfo tz):
5891
return isinstance(tz, _dateutil_tzlocal)
5992

@@ -210,6 +243,8 @@ cdef inline bint is_fixed_offset(tzinfo tz):
210243
return 1
211244
else:
212245
return 0
246+
elif is_zoneinfo(tz):
247+
return 0
213248
# This also implicitly accepts datetime.timezone objects which are
214249
# considered fixed
215250
return 1
@@ -264,6 +299,8 @@ cdef object get_dst_info(tzinfo tz):
264299
# e.g. pytz.FixedOffset, matplotlib.dates._UTC,
265300
# psycopg2.tz.FixedOffsetTimezone
266301
num = int(get_utcoffset(tz, None).total_seconds()) * 1_000_000_000
302+
# If we have e.g. ZoneInfo here, the get_utcoffset call will return None,
303+
# so the total_seconds() call will raise AttributeError.
267304
return (np.array([NPY_NAT + 1], dtype=np.int64),
268305
np.array([num], dtype=np.int64),
269306
"unknown")
@@ -291,13 +328,13 @@ cdef object get_dst_info(tzinfo tz):
291328
# deltas
292329
deltas = np.array([v.offset for v in (
293330
tz._ttinfo_before,) + tz._trans_idx], dtype='i8')
294-
deltas *= 1000000000
331+
deltas *= 1_000_000_000
295332
typ = 'dateutil'
296333

297334
elif is_fixed_offset(tz):
298335
trans = np.array([NPY_NAT + 1], dtype=np.int64)
299336
deltas = np.array([tz._ttinfo_std.offset],
300-
dtype='i8') * 1000000000
337+
dtype='i8') * 1_000_000_000
301338
typ = 'fixed'
302339
else:
303340
# 2018-07-12 this is not reached in the tests, and this case

pandas/_libs/tslibs/tzconversion.pyx

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ from pandas._libs.tslibs.timezones cimport (
4242
is_fixed_offset,
4343
is_tzlocal,
4444
is_utc,
45+
is_zoneinfo,
4546
utc_pytz,
4647
)
4748

@@ -60,7 +61,7 @@ cdef int64_t tz_localize_to_utc_single(
6061
elif is_utc(tz) or tz is None:
6162
return val
6263

63-
elif is_tzlocal(tz):
64+
elif is_tzlocal(tz) or is_zoneinfo(tz):
6465
return val - _tz_localize_using_tzinfo_api(val, tz, to_utc=True)
6566

6667
elif is_fixed_offset(tz):
@@ -135,7 +136,7 @@ timedelta-like}
135136

136137
result = np.empty(n, dtype=np.int64)
137138

138-
if is_tzlocal(tz):
139+
if is_tzlocal(tz) or is_zoneinfo(tz):
139140
for i in range(n):
140141
v = vals[i]
141142
if v == NPY_NAT:
@@ -484,8 +485,8 @@ cdef int64_t tz_convert_from_utc_single(
484485

485486
if is_utc(tz):
486487
return utc_val
487-
elif is_tzlocal(tz):
488-
return utc_val + _tz_localize_using_tzinfo_api(utc_val, tz, to_utc=False)
488+
elif is_tzlocal(tz) or is_zoneinfo(tz):
489+
return utc_val + _tz_localize_using_tzinfo_api(utc_val, tz, to_utc=False, fold=fold)
489490
else:
490491
trans, deltas, typ = get_dst_info(tz)
491492
tdata = <int64_t*>cnp.PyArray_DATA(trans)
@@ -569,7 +570,7 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] stamps, tzinfo tz):
569570

570571
if is_utc(tz) or tz is None:
571572
use_utc = True
572-
elif is_tzlocal(tz):
573+
elif is_tzlocal(tz) or is_zoneinfo(tz):
573574
use_tzlocal = True
574575
else:
575576
trans, deltas, typ = get_dst_info(tz)

0 commit comments

Comments
 (0)