Skip to content

Commit ab4d36d

Browse files
authored
Merge branch 'master' into sort_values_bug
2 parents b6587fc + 300d1fc commit ab4d36d

File tree

108 files changed

+2539
-2049
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

108 files changed

+2539
-2049
lines changed

.github/workflows/comment_bot.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ jobs:
2929
- name: Install-pre-commit
3030
run: python -m pip install --upgrade pre-commit
3131
- name: Run pre-commit
32-
run: pre-commit run --all-files || (exit 0)
32+
run: pre-commit run --from-ref=origin/master --to-ref=HEAD --all-files || (exit 0)
3333
- name: Commit results
3434
run: |
3535
git config user.name "$(git log -1 --pretty=format:%an)"

asv_bench/benchmarks/series_methods.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,8 @@ def setup(self):
108108
self.vals_short = np.arange(2).astype(object)
109109
self.vals_long = np.arange(10 ** 5).astype(object)
110110
# because of nans floats are special:
111-
self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float)).astype(object)
112-
self.vals_long_floats = np.arange(10 ** 5, dtype=np.float).astype(object)
111+
self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float_)).astype(object)
112+
self.vals_long_floats = np.arange(10 ** 5, dtype=np.float_).astype(object)
113113

114114
def time_isin_nans(self):
115115
# if nan-objects are different objects,

ci/deps/actions-37-locale.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ dependencies:
1111
- hypothesis>=3.58.0
1212

1313
# required
14-
- numpy
14+
- numpy<1.20 # GH#39541 compat for pyarrow<3
1515
- python-dateutil
1616
- pytz
1717

ci/deps/azure-37.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ dependencies:
1818
- numpy
1919
- python-dateutil
2020
- nomkl
21-
- pyarrow
21+
- pyarrow=0.15.1
2222
- pytz
2323
- s3fs>=0.4.0
2424
- moto>=1.3.14

ci/deps/azure-38-locale.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ dependencies:
2424
- moto
2525
- nomkl
2626
- numexpr
27-
- numpy
27+
- numpy<1.20 # GH#39541 compat with pyarrow<3
2828
- openpyxl
2929
- pytables
3030
- python-dateutil

ci/deps/azure-macos-37.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ dependencies:
2121
- numexpr
2222
- numpy=1.16.5
2323
- openpyxl
24-
- pyarrow>=0.15.0
24+
- pyarrow=0.15.1
2525
- pytables
2626
- python-dateutil==2.7.3
2727
- pytz

doc/source/user_guide/enhancingperf.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,8 @@ in Python, so maybe we could minimize these by cythonizing the apply part.
199199
...: return s * dx
200200
...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b,
201201
...: np.ndarray col_N):
202-
...: assert (col_a.dtype == np.float
203-
...: and col_b.dtype == np.float and col_N.dtype == np.int)
202+
...: assert (col_a.dtype == np.float_
203+
...: and col_b.dtype == np.float_ and col_N.dtype == np.int_)
204204
...: cdef Py_ssize_t i, n = len(col_N)
205205
...: assert (len(col_a) == len(col_b) == n)
206206
...: cdef np.ndarray[double] res = np.empty(n)

doc/source/whatsnew/v0.8.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ New plotting methods
176176
Vytautas Jancauskas, the 2012 GSOC participant, has added many new plot
177177
types. For example, ``'kde'`` is a new option:
178178

179-
.. code-block:: python
179+
.. ipython:: python
180180
181181
s = pd.Series(
182182
np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3))

doc/source/whatsnew/v1.2.2.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ including other versions of pandas.
1414

1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
17+
18+
- Fixed regression in :func:`read_excel` that caused it to raise ``AttributeError`` when checking the version of older xlrd installations (:issue:`38955`)
19+
- Fixed regression in :class:`DataFrame` constructor reordering elements when constructing from a datetime ndarray with a dtype other than ``"datetime64[ns]"`` (:issue:`39422`)
20+
- Fixed regression in :meth:`DataFrame.astype` and :meth:`Series.astype` not casting to bytes dtype (:issue:`39474`)
1721
- Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`)
1822
- Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`)
1923
- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
@@ -26,7 +30,7 @@ Fixed regressions
2630
Bug fixes
2731
~~~~~~~~~
2832

29-
-
33+
- :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`)
3034
-
3135

3236
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.3.0.rst

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ Other enhancements
5555
- :meth:`DataFrame.plot.scatter` can now accept a categorical column as the argument to ``c`` (:issue:`12380`, :issue:`31357`)
5656
- :meth:`.Styler.set_tooltips` allows on-hover tooltips to be added to styled HTML dataframes.
5757
- :meth:`Series.loc.__getitem__` and :meth:`Series.loc.__setitem__` with :class:`MultiIndex` now raise a helpful error message when the indexer has too many dimensions (:issue:`35349`)
58+
- :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files.
5859

5960
.. ---------------------------------------------------------------------------
6061
@@ -274,7 +275,6 @@ Datetimelike
274275
- Bug in constructing a :class:`DataFrame` or :class:`Series` with mismatched ``datetime64`` data and ``timedelta64`` dtype, or vice-versa, failing to raise ``TypeError`` (:issue:`38575`, :issue:`38764`, :issue:`38792`)
275276
- Bug in constructing a :class:`Series` or :class:`DataFrame` with a ``datetime`` object out of bounds for ``datetime64[ns]`` dtype or a ``timedelta`` object out of bounds for ``timedelta64[ns]`` dtype (:issue:`38792`, :issue:`38965`)
276277
- Bug in :meth:`DatetimeIndex.intersection`, :meth:`DatetimeIndex.symmetric_difference`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference` always returning object-dtype when operating with :class:`CategoricalIndex` (:issue:`38741`)
277-
- Bug in :class:`DataFrame` constructor reordering element when construction from datetime ndarray with dtype not ``"datetime64[ns]"`` (:issue:`39422`)
278278
- Bug in :meth:`Series.where` incorrectly casting ``datetime64`` values to ``int64`` (:issue:`37682`)
279279
- Bug in :class:`Categorical` incorrectly typecasting ``datetime`` object to ``Timestamp`` (:issue:`38878`)
280280
- Bug in comparisons between :class:`Timestamp` object and ``datetime64`` objects just outside the implementation bounds for nanosecond ``datetime64`` (:issue:`39221`)
@@ -284,7 +284,7 @@ Datetimelike
284284
Timedelta
285285
^^^^^^^^^
286286
- Bug in constructing :class:`Timedelta` from ``np.timedelta64`` objects with non-nanosecond units that are out of bounds for ``timedelta64[ns]`` (:issue:`38965`)
287-
-
287+
- Bug in constructing a :class:`TimedeltaIndex` incorrectly accepting ``np.datetime64("NaT")`` objects (:issue:`39462`)
288288
-
289289

290290
Timezones
@@ -376,6 +376,7 @@ I/O
376376
- Bug in :func:`read_csv` not switching ``true_values`` and ``false_values`` for nullable ``boolean`` dtype (:issue:`34655`)
377377
- Bug in :func:`read_json` when ``orient="split"`` does not maintain numeric string index (:issue:`28556`)
378378
- :func:`read_sql` returned an empty generator if ``chunksize`` was non-zero and the query returned no results. Now returns a generator with a single empty dataframe (:issue:`34411`)
379+
- Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`)
379380

380381
Period
381382
^^^^^^
@@ -412,6 +413,7 @@ Reshaping
412413
- :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`)
413414
- Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`)
414415
- Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns (:issue:`39464`)
416+
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
415417

416418
Sparse
417419
^^^^^^
@@ -432,7 +434,7 @@ Other
432434
- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`)
433435
- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`)
434436
- Bug in :class:`Styler` which caused CSS to duplicate on multiple renders. (:issue:`39395`)
435-
-
437+
- Bug in :func:`pandas.testing.assert_series_equal`, :func:`pandas.testing.assert_frame_equal`, :func:`pandas.testing.assert_index_equal` and :func:`pandas.testing.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`)
436438

437439
.. ---------------------------------------------------------------------------
438440

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ channels:
33
- conda-forge
44
dependencies:
55
# required
6-
- numpy>=1.16.5
6+
- numpy>=1.16.5, <1.20 # gh-39513
77
- python=3
88
- python-dateutil>=2.7.3
99
- pytz

pandas/_libs/hashtable.pyx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ from pandas._libs.khash cimport (
1919
are_equivalent_float64_t,
2020
are_equivalent_khcomplex64_t,
2121
are_equivalent_khcomplex128_t,
22+
kh_needed_n_buckets,
2223
kh_str_t,
2324
khcomplex64_t,
2425
khcomplex128_t,
@@ -152,7 +153,7 @@ def unique_label_indices(const int64_t[:] labels):
152153
ndarray[int64_t, ndim=1] arr
153154
Int64VectorData *ud = idx.data
154155

155-
kh_resize_int64(table, min(n, SIZE_HINT_LIMIT))
156+
kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
156157

157158
with nogil:
158159
for i in range(n):

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -392,9 +392,8 @@ cdef class {{name}}HashTable(HashTable):
392392

393393
def __cinit__(self, int64_t size_hint=1):
394394
self.table = kh_init_{{dtype}}()
395-
if size_hint is not None:
396-
size_hint = min(size_hint, SIZE_HINT_LIMIT)
397-
kh_resize_{{dtype}}(self.table, size_hint)
395+
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
396+
kh_resize_{{dtype}}(self.table, size_hint)
398397

399398
def __len__(self) -> int:
400399
return self.table.size
@@ -420,6 +419,15 @@ cdef class {{name}}HashTable(HashTable):
420419
sizeof(Py_ssize_t)) # vals
421420
return overhead + for_flags + for_pairs
422421

422+
def get_state(self):
423+
""" returns infos about the state of the hashtable"""
424+
return {
425+
'n_buckets' : self.table.n_buckets,
426+
'size' : self.table.size,
427+
'n_occupied' : self.table.n_occupied,
428+
'upper_bound' : self.table.upper_bound,
429+
}
430+
423431
cpdef get_item(self, {{dtype}}_t val):
424432
cdef:
425433
khiter_t k
@@ -731,9 +739,8 @@ cdef class StringHashTable(HashTable):
731739

732740
def __init__(self, int64_t size_hint=1):
733741
self.table = kh_init_str()
734-
if size_hint is not None:
735-
size_hint = min(size_hint, SIZE_HINT_LIMIT)
736-
kh_resize_str(self.table, size_hint)
742+
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
743+
kh_resize_str(self.table, size_hint)
737744

738745
def __dealloc__(self):
739746
if self.table is not NULL:
@@ -747,6 +754,15 @@ cdef class StringHashTable(HashTable):
747754
sizeof(Py_ssize_t)) # vals
748755
return overhead + for_flags + for_pairs
749756

757+
def get_state(self):
758+
""" returns infos about the state of the hashtable"""
759+
return {
760+
'n_buckets' : self.table.n_buckets,
761+
'size' : self.table.size,
762+
'n_occupied' : self.table.n_occupied,
763+
'upper_bound' : self.table.upper_bound,
764+
}
765+
750766
cpdef get_item(self, str val):
751767
cdef:
752768
khiter_t k
@@ -1044,9 +1060,8 @@ cdef class PyObjectHashTable(HashTable):
10441060

10451061
def __init__(self, int64_t size_hint=1):
10461062
self.table = kh_init_pymap()
1047-
if size_hint is not None:
1048-
size_hint = min(size_hint, SIZE_HINT_LIMIT)
1049-
kh_resize_pymap(self.table, size_hint)
1063+
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
1064+
kh_resize_pymap(self.table, size_hint)
10501065

10511066
def __dealloc__(self):
10521067
if self.table is not NULL:
@@ -1072,6 +1087,18 @@ cdef class PyObjectHashTable(HashTable):
10721087
sizeof(Py_ssize_t)) # vals
10731088
return overhead + for_flags + for_pairs
10741089

1090+
def get_state(self):
1091+
"""
1092+
returns infos about the current state of the hashtable like size,
1093+
number of buckets and so on.
1094+
"""
1095+
return {
1096+
'n_buckets' : self.table.n_buckets,
1097+
'size' : self.table.size,
1098+
'n_occupied' : self.table.n_occupied,
1099+
'upper_bound' : self.table.upper_bound,
1100+
}
1101+
10751102
cpdef get_item(self, object val):
10761103
cdef:
10771104
khiter_t k

pandas/_libs/hashtable_func_helper.pxi.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
121121
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
122122
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
123123

124-
kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT))
124+
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
125125

126126
if keep not in ('last', 'first', False):
127127
raise ValueError('keep must be either "first", "last" or False')

pandas/_libs/khash.pxd

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,4 +120,7 @@ cdef extern from "khash_python.h":
120120

121121
bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
122122

123+
khuint_t kh_needed_n_buckets(khuint_t element_n) nogil
124+
125+
123126
include "khash_for_primitive_helper.pxi"

pandas/_libs/src/klib/khash_python.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,3 +244,13 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
244244
void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) {
245245
kh_resize_str(table->table, val);
246246
}
247+
248+
// utility function: given the number of elements
249+
// returns number of necessary buckets
250+
khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){
251+
khuint_t candidate = n_elements;
252+
kroundup32(candidate);
253+
khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5);
254+
return (upper_bound < n_elements) ? 2*candidate : candidate;
255+
256+
}

pandas/_libs/tslibs/nattype.pyx

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -286,12 +286,6 @@ cdef class _NaT(datetime):
286286
def __hash__(self):
287287
return NPY_NAT
288288

289-
def __int__(self):
290-
return NPY_NAT
291-
292-
def __long__(self):
293-
return NPY_NAT
294-
295289
@property
296290
def is_leap_year(self) -> bool:
297291
return False

pandas/_libs/tslibs/timedeltas.pyx

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -257,41 +257,37 @@ cdef convert_to_timedelta64(object ts, str unit):
257257
elif isinstance(ts, _Timedelta):
258258
# already in the proper format
259259
ts = np.timedelta64(ts.value, "ns")
260-
elif is_datetime64_object(ts):
261-
# only accept a NaT here
262-
if ts.astype('int64') == NPY_NAT:
263-
return np.timedelta64(NPY_NAT)
264260
elif is_timedelta64_object(ts):
265261
ts = ensure_td64ns(ts)
266262
elif is_integer_object(ts):
267263
if ts == NPY_NAT:
268264
return np.timedelta64(NPY_NAT, "ns")
269265
else:
270-
if unit in ['Y', 'M', 'W']:
266+
if unit in ["Y", "M", "W"]:
271267
ts = np.timedelta64(ts, unit)
272268
else:
273269
ts = cast_from_unit(ts, unit)
274270
ts = np.timedelta64(ts, "ns")
275271
elif is_float_object(ts):
276-
if unit in ['Y', 'M', 'W']:
272+
if unit in ["Y", "M", "W"]:
277273
ts = np.timedelta64(int(ts), unit)
278274
else:
279275
ts = cast_from_unit(ts, unit)
280276
ts = np.timedelta64(ts, "ns")
281277
elif isinstance(ts, str):
282-
if len(ts) > 0 and ts[0] == 'P':
278+
if len(ts) > 0 and ts[0] == "P":
283279
ts = parse_iso_format_string(ts)
284280
else:
285281
ts = parse_timedelta_string(ts)
286282
ts = np.timedelta64(ts, "ns")
287283
elif is_tick_object(ts):
288-
ts = np.timedelta64(ts.nanos, 'ns')
284+
ts = np.timedelta64(ts.nanos, "ns")
289285

290286
if PyDelta_Check(ts):
291-
ts = np.timedelta64(delta_to_nanoseconds(ts), 'ns')
287+
ts = np.timedelta64(delta_to_nanoseconds(ts), "ns")
292288
elif not is_timedelta64_object(ts):
293289
raise ValueError(f"Invalid type for timedelta scalar: {type(ts)}")
294-
return ts.astype('timedelta64[ns]')
290+
return ts.astype("timedelta64[ns]")
295291

296292

297293
@cython.boundscheck(False)

pandas/_testing/asserters.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -459,13 +459,24 @@ def assert_attr_equal(attr: str, left, right, obj: str = "Attributes"):
459459
):
460460
# np.nan
461461
return True
462+
elif (
463+
isinstance(left_attr, (np.datetime64, np.timedelta64))
464+
and isinstance(right_attr, (np.datetime64, np.timedelta64))
465+
and type(left_attr) is type(right_attr)
466+
and np.isnat(left_attr)
467+
and np.isnat(right_attr)
468+
):
469+
# np.datetime64("nat") or np.timedelta64("nat")
470+
return True
462471

463472
try:
464473
result = left_attr == right_attr
465474
except TypeError:
466475
# datetimetz on rhs may raise TypeError
467476
result = False
468-
if not isinstance(result, bool):
477+
if (left_attr is pd.NA) ^ (right_attr is pd.NA):
478+
result = False
479+
elif not isinstance(result, bool):
469480
result = result.all()
470481

471482
if result:

pandas/_typing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
Suffixes = Tuple[str, str]
9292
Ordered = Optional[bool]
9393
JSONSerializable = Optional[Union[PythonScalar, List, Dict]]
94-
Axes = Collection
94+
Axes = Collection[Any]
9595

9696
# dtypes
9797
NpDtype = Union[str, np.dtype]

0 commit comments

Comments
 (0)