
Commit b90967d

Merge remote-tracking branch 'upstream/master' into fix-20432
2 parents: b282327 + cc84a23

88 files changed: 1305 additions and 779 deletions


.pre-commit-config.yaml

Lines changed: 9 additions & 3 deletions
@@ -36,7 +36,7 @@ repos:
     rev: 3.9.0
     hooks:
     -   id: flake8
-        additional_dependencies: [flake8-comprehensions>=3.1.0]
+        additional_dependencies: [flake8-comprehensions>=3.1.0, flake8-bugbear>=21.3.2]
     -   id: flake8
         name: flake8 (cython)
         types: [cython]
@@ -86,11 +86,10 @@ repos:
         types: [python]
         exclude: ^pandas/_typing\.py$
     -   id: inconsistent-namespace-usage
-        name: 'Check for inconsistent use of pandas namespace in tests'
+        name: 'Check for inconsistent use of pandas namespace'
         entry: python scripts/check_for_inconsistent_pandas_namespace.py
         language: python
         types: [python]
-        files: ^pandas/tests/
     -   id: incorrect-code-directives
         name: Check for incorrect code block or IPython directives
         language: pygrep
@@ -213,3 +212,10 @@ repos:
         |\#\ type:\s?ignore(?!\[)
         language: pygrep
         types: [python]
+    -   id: use-pd_array-in-core
+        name: Import pandas.array as pd_array in core
+        language: python
+        entry: python scripts/use_pd_array_in_core.py
+        files: ^pandas/core/
+        exclude: ^pandas/core/api\.py$
+        types: [python]
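
The new use-pd_array-in-core hook runs scripts/use_pd_array_in_core.py, which this commit adds elsewhere in the file tree; the script body is not part of the hunks shown here. As a rough sketch of what a checker wired into pre-commit this way could look like (the regex, message, and argument handling below are assumptions, not the actual script):

# Hypothetical sketch only -- the real scripts/use_pd_array_in_core.py is not
# shown in this commit and may differ substantially.
import argparse
import re
import sys

# Flag direct "pd.array(" calls; core code is expected to import the function
# as pd_array and call that instead.
PD_ARRAY_CALL = re.compile(r"\bpd\.array\(")


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("paths", nargs="*")  # pre-commit passes matched files
    args = parser.parse_args()

    failed = False
    for path in args.paths:
        with open(path, encoding="utf-8") as fh:
            for lineno, line in enumerate(fh, start=1):
                if PD_ARRAY_CALL.search(line):
                    print(f"{path}:{lineno}: use pd_array instead of pd.array")
                    failed = True
    return int(failed)


if __name__ == "__main__":
    sys.exit(main())

pre-commit invokes the entry with the matched file paths as arguments and fails the hook on a nonzero exit status, which is why the sketch returns 1 on a violation.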

asv_bench/benchmarks/arithmetic.py

Lines changed: 9 additions & 11 deletions
@@ -140,9 +140,7 @@ def setup(self, op, shape):
         # construct dataframe with 2 blocks
         arr1 = np.random.randn(n_rows, n_cols // 2).astype("f8")
         arr2 = np.random.randn(n_rows, n_cols // 2).astype("f4")
-        df = pd.concat(
-            [pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True
-        )
+        df = pd.concat([DataFrame(arr1), DataFrame(arr2)], axis=1, ignore_index=True)
         # should already be the case, but just to be sure
         df._consolidate_inplace()

@@ -151,7 +149,7 @@ def setup(self, op, shape):
         arr2 = np.random.randn(n_rows, n_cols // 2).astype("i8")
         arr3 = np.random.randn(n_rows, n_cols // 4).astype("f8")
         df2 = pd.concat(
-            [pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)],
+            [DataFrame(arr1), DataFrame(arr2), DataFrame(arr3)],
             axis=1,
             ignore_index=True,
         )
@@ -459,9 +457,9 @@ class OffsetArrayArithmetic:

     def setup(self, offset):
         N = 10000
-        rng = pd.date_range(start="1/1/2000", periods=N, freq="T")
+        rng = date_range(start="1/1/2000", periods=N, freq="T")
         self.rng = rng
-        self.ser = pd.Series(rng)
+        self.ser = Series(rng)

     def time_add_series_offset(self, offset):
         with warnings.catch_warnings(record=True):
@@ -478,7 +476,7 @@ class ApplyIndex:

     def setup(self, offset):
         N = 10000
-        rng = pd.date_range(start="1/1/2000", periods=N, freq="T")
+        rng = date_range(start="1/1/2000", periods=N, freq="T")
         self.rng = rng

     def time_apply_index(self, offset):
@@ -490,17 +488,17 @@ class BinaryOpsMultiIndex:
     param_names = ["func"]

     def setup(self, func):
-        date_range = pd.date_range("20200101 00:00", "20200102 0:00", freq="S")
+        array = date_range("20200101 00:00", "20200102 0:00", freq="S")
         level_0_names = [str(i) for i in range(30)]

-        index = pd.MultiIndex.from_product([level_0_names, date_range])
+        index = pd.MultiIndex.from_product([level_0_names, array])
         column_names = ["col_1", "col_2"]

-        self.df = pd.DataFrame(
+        self.df = DataFrame(
             np.random.rand(len(index), 2), index=index, columns=column_names
         )

-        self.arg_df = pd.DataFrame(
+        self.arg_df = DataFrame(
             np.random.randint(1, 10, (len(level_0_names), 2)),
             index=level_0_names,
             columns=column_names,
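
These hunks swap pd.-prefixed calls for bare DataFrame, Series, and date_range, matching the inconsistent-namespace-usage hook above, which this commit widens beyond pandas/tests/. The bare names only resolve if the module imports them directly; a minimal sketch of the imports the edited benchmarks presumably rely on (the actual import block sits outside the hunks shown):

# Presumed top-of-module imports for the edited benchmarks; the real import
# block is not visible in these hunks, so treat this as an assumption.
import warnings

import numpy as np

import pandas as pd
from pandas import DataFrame, Series, date_range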

asv_bench/benchmarks/sparse.py

Lines changed: 2 additions & 2 deletions
@@ -28,7 +28,7 @@ def setup(self):
             data = np.random.randn(N)[:-i]
             idx = rng[:-i]
             data[100:] = np.nan
-            self.series[i] = pd.Series(pd.SparseArray(data), index=idx)
+            self.series[i] = Series(SparseArray(data), index=idx)

     def time_series_to_frame(self):
         pd.DataFrame(self.series)
@@ -63,7 +63,7 @@ def setup(self):
         )

     def time_sparse_series_from_coo(self):
-        pd.Series.sparse.from_coo(self.matrix)
+        Series.sparse.from_coo(self.matrix)


 class ToCoo:

doc/source/ecosystem.rst

Lines changed: 12 additions & 11 deletions
@@ -475,7 +475,7 @@ arrays can be stored inside pandas' Series and DataFrame.
 `Pandas-Genomics`_
 ~~~~~~~~~~~~~~~~~~

-Pandas-Genomics provides extension types and extension arrays for working with genomics data
+Pandas-Genomics provides extension types, extension arrays, and extension accessors for working with genomics data

 `Pint-Pandas`_
 ~~~~~~~~~~~~~~
@@ -502,16 +502,17 @@ A directory of projects providing
 :ref:`extension accessors <extending.register-accessors>`. This is for users to
 discover new accessors and for library authors to coordinate on the namespace.

-=============== ============ ==================================== ===============================================================
-Library         Accessor     Classes                              Description
-=============== ============ ==================================== ===============================================================
-`cyberpandas`_  ``ip``       ``Series``                           Provides common operations for working with IP addresses.
-`pdvega`_       ``vgplot``   ``Series``, ``DataFrame``            Provides plotting functions from the Altair_ library.
-`pandas_path`_  ``path``     ``Index``, ``Series``                Provides `pathlib.Path`_ functions for Series.
-`pint-pandas`_  ``pint``     ``Series``, ``DataFrame``            Provides units support for numeric Series and DataFrames.
-`composeml`_    ``slice``    ``DataFrame``                        Provides a generator for enhanced data slicing.
-`datatest`_     ``validate`` ``Series``, ``DataFrame``, ``Index`` Provides validation, differences, and acceptance managers.
-=============== ============ ==================================== ===============================================================
+================== ============ ==================================== ===============================================================================
+Library            Accessor     Classes                              Description
+================== ============ ==================================== ===============================================================================
+`cyberpandas`_     ``ip``       ``Series``                           Provides common operations for working with IP addresses.
+`pdvega`_          ``vgplot``   ``Series``, ``DataFrame``            Provides plotting functions from the Altair_ library.
+`pandas-genomics`_ ``genomics`` ``Series``, ``DataFrame``            Provides common operations for quality control and analysis of genomics data
+`pandas_path`_     ``path``     ``Index``, ``Series``                Provides `pathlib.Path`_ functions for Series.
+`pint-pandas`_     ``pint``     ``Series``, ``DataFrame``            Provides units support for numeric Series and DataFrames.
+`composeml`_       ``slice``    ``DataFrame``                        Provides a generator for enhanced data slicing.
+`datatest`_        ``validate`` ``Series``, ``DataFrame``, ``Index`` Provides validation, differences, and acceptance managers.
+================== ============ ==================================== ===============================================================================

 .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest
 .. _pdvega: https://altair-viz.github.io/pdvega/
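
Each entry in this table attaches its namespace through pandas' public accessor-registration API. A toy sketch of how a library like pandas-genomics could register one (illustrative only, not the pandas-genomics implementation):

import pandas as pd


@pd.api.extensions.register_series_accessor("genomics")
class GenomicsAccessor:
    """Toy accessor; the real pandas-genomics accessor is far richer."""

    def __init__(self, obj: pd.Series) -> None:
        self._obj = obj

    def gc_content(self) -> float:
        # Fraction of G/C bases across all sequences in the Series.
        joined = "".join(self._obj)
        return sum(base in "GC" for base in joined) / len(joined)


seqs = pd.Series(["ACGT", "GGCC"])
print(seqs.genomics.gc_content())  # 0.75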

doc/source/whatsnew/v1.2.4.rst

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ Fixed regressions

 - Fixed regression in :meth:`DataFrame.sum` when ``min_count`` greater than the :class:`DataFrame` shape was passed resulted in a ``ValueError`` (:issue:`39738`)
 - Fixed regression in :meth:`DataFrame.to_json` raising ``AttributeError`` when run on PyPy (:issue:`39837`)
+- Fixed regression in (in)equality comparison of ``pd.NaT`` with a non-datetimelike numpy array returning a scalar instead of an array (:issue:`40722`)
 - Fixed regression in :meth:`DataFrame.where` not returning a copy in the case of an all True condition (:issue:`39595`)
 - Fixed regression in :meth:`DataFrame.replace` raising ``IndexError`` when ``regex`` was a multi-key dictionary (:issue:`39338`)
 -
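
To make the new pd.NaT entry concrete, here is a small sketch of the fixed behavior, inferred from the changelog wording rather than from the regression tests:

import numpy as np
import pandas as pd

arr = np.array([1, 2, 3])  # non-datetimelike ndarray

# The regression collapsed the comparison to a single scalar; after the fix
# it broadcasts elementwise, matching NaT's other array comparisons.
print(pd.NaT == arr)  # expected: [False False False]
print(pd.NaT != arr)  # expected: [ True  True  True]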

doc/source/whatsnew/v1.3.0.rst

Lines changed: 2 additions & 0 deletions
@@ -161,6 +161,7 @@ Other enhancements
 - :meth:`.Styler.apply` now more consistently accepts ndarray function returns, i.e. in all cases for ``axis`` is ``0, 1 or None`` (:issue:`39359`)
 - :meth:`.Styler.apply` and :meth:`.Styler.applymap` now raise errors if wrong format CSS is passed on render (:issue:`39660`)
 - :meth:`.Styler.format` adds keyword argument ``escape`` for optional HTML escaping (:issue:`40437`)
+- :meth:`.Styler.clear` now clears :attr:`Styler.hidden_index` and :attr:`Styler.hidden_columns` as well (:issue:`40484`)
 - Builtin highlighting methods in :class:`Styler` have a more consistent signature and css customisability (:issue:`40242`)
 - :meth:`Series.loc.__getitem__` and :meth:`Series.loc.__setitem__` with :class:`MultiIndex` now raising helpful error message when indexer has too many dimensions (:issue:`35349`)
 - :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files.
@@ -561,6 +562,7 @@ Numeric
 - Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`)
 - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
 - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
+- Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`)
 -

 Conversion
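
The DataFrameGroupBy.rank entry corresponds to the rank_1d changes in pandas/_libs/algos.pyx further down in this commit. A small sketch of the kind of case the fix targets, equal values sitting on either side of a group boundary (inputs illustrative):

import pandas as pd

# The 2 that ends group "a" ties with the 2s that start group "b"; pct
# ranks must nonetheless be computed within each group separately.
df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 2, 2]})

print(df.groupby("key")["val"].rank(pct=True))
# Expected with the default "average" method:
#   group "a": 0.50, 1.00    group "b": 0.75, 0.75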

environment.yml

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ dependencies:
   - black=20.8b1
   - cpplint
   - flake8
+  - flake8-bugbear>=21.3.2  # used by flake8, find likely bugs
   - flake8-comprehensions>=3.1.0  # used by flake8, linting of unnecessary comprehensions
   - isort>=5.2.1  # check that imports are in the right order
   - mypy=0.812
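
For a sense of what the new linter adds: flake8-bugbear flags likely-bug patterns that plain flake8 misses. Two classic examples (the check codes are bugbear's standard ones; this demo file is not part of the commit):

# bugbear_demo.py -- patterns flake8-bugbear reports that vanilla flake8 misses.
import time


def append_item(item, bucket=[]):  # B006: mutable default argument is shared
    bucket.append(item)            # across calls, so state leaks between them
    return bucket


def log(message, when=time.time()):  # B008: the default is evaluated once at
    print(when, message)             # import time, not on each call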

pandas/_libs/algos.pyx

Lines changed: 47 additions & 27 deletions
@@ -947,12 +947,14 @@ def rank_1d(
         TiebreakEnumType tiebreak
         Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0
         Py_ssize_t grp_vals_seen=1, grp_na_count=0
-        ndarray[int64_t, ndim=1] lexsort_indexer
-        ndarray[float64_t, ndim=1] grp_sizes, out
+        ndarray[int64_t, ndim=1] grp_sizes
+        ndarray[intp_t, ndim=1] lexsort_indexer
+        ndarray[float64_t, ndim=1] out
         ndarray[rank_t, ndim=1] masked_vals
         ndarray[uint8_t, ndim=1] mask
         bint keep_na, at_end, next_val_diff, check_labels, group_changed
         rank_t nan_fill_val
+        int64_t grp_size

     tiebreak = tiebreakers[ties_method]
     if tiebreak == TIEBREAK_FIRST:
@@ -965,7 +967,7 @@ def rank_1d(
     # TODO Cython 3.0: cast won't be necessary (#2992)
     assert <Py_ssize_t>len(labels) == N
     out = np.empty(N)
-    grp_sizes = np.ones(N)
+    grp_sizes = np.ones(N, dtype=np.int64)

     # If all 0 labels, can short-circuit later label
     # comparisons
@@ -1022,7 +1024,7 @@ def rank_1d(
     # each label corresponds to a different group value,
     # the mask helps you differentiate missing values before
     # performing sort on the actual values
-    lexsort_indexer = np.lexsort(order).astype(np.int64, copy=False)
+    lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False)

     if not ascending:
         lexsort_indexer = lexsort_indexer[::-1]
@@ -1093,13 +1095,15 @@ def rank_1d(
                 for j in range(i - dups + 1, i + 1):
                     out[lexsort_indexer[j]] = grp_vals_seen

-                # Look forward to the next value (using the sorting in lexsort_indexer)
-                # if the value does not equal the current value then we need to
-                # reset the dups and sum_ranks, knowing that a new value is
-                # coming up. The conditional also needs to handle nan equality
-                # and the end of iteration
-                if next_val_diff or (mask[lexsort_indexer[i]]
-                                     ^ mask[lexsort_indexer[i+1]]):
+                # Look forward to the next value (using the sorting in
+                # lexsort_indexer). If the value does not equal the current
+                # value then we need to reset the dups and sum_ranks, knowing
+                # that a new value is coming up. The conditional also needs
+                # to handle nan equality and the end of iteration. If group
+                # changes we do not record seeing a new value in the group
+                if not group_changed and (next_val_diff or
+                                          (mask[lexsort_indexer[i]]
+                                           ^ mask[lexsort_indexer[i+1]])):
                     dups = sum_ranks = 0
                     grp_vals_seen += 1

@@ -1110,14 +1114,21 @@ def rank_1d(
             # group encountered (used by pct calculations later). Also be
             # sure to reset any of the items helping to calculate dups
             if group_changed:
+
+                # If not dense tiebreak, group size used to compute
+                # percentile will be # of non-null elements in group
                 if tiebreak != TIEBREAK_DENSE:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[lexsort_indexer[j]] = \
-                            (i - grp_start + 1 - grp_na_count)
+                    grp_size = i - grp_start + 1 - grp_na_count
+
+                # Otherwise, it will be the number of distinct values
+                # in the group, subtracting 1 if NaNs are present
+                # since that is a distinct value we shouldn't count
                 else:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[lexsort_indexer[j]] = \
-                            (grp_vals_seen - 1 - (grp_na_count > 0))
+                    grp_size = grp_vals_seen - (grp_na_count > 0)
+
+                for j in range(grp_start, i + 1):
+                    grp_sizes[lexsort_indexer[j]] = grp_size
+
                 dups = sum_ranks = 0
                 grp_na_count = 0
                 grp_start = i + 1
@@ -1184,12 +1195,14 @@ def rank_1d(
                     out[lexsort_indexer[j]] = grp_vals_seen

                 # Look forward to the next value (using the sorting in
-                # lexsort_indexer) if the value does not equal the current
+                # lexsort_indexer). If the value does not equal the current
                 # value then we need to reset the dups and sum_ranks, knowing
                 # that a new value is coming up. The conditional also needs
-                # to handle nan equality and the end of iteration
-                if next_val_diff or (mask[lexsort_indexer[i]]
-                                     ^ mask[lexsort_indexer[i+1]]):
+                # to handle nan equality and the end of iteration. If group
+                # changes we do not record seeing a new value in the group
+                if not group_changed and (next_val_diff or
+                                          (mask[lexsort_indexer[i]]
+                                           ^ mask[lexsort_indexer[i+1]])):
                     dups = sum_ranks = 0
                     grp_vals_seen += 1

@@ -1200,14 +1213,21 @@ def rank_1d(
             # group encountered (used by pct calculations later). Also be
             # sure to reset any of the items helping to calculate dups
             if group_changed:
+
+                # If not dense tiebreak, group size used to compute
+                # percentile will be # of non-null elements in group
                 if tiebreak != TIEBREAK_DENSE:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[lexsort_indexer[j]] = \
-                            (i - grp_start + 1 - grp_na_count)
+                    grp_size = i - grp_start + 1 - grp_na_count
+
+                # Otherwise, it will be the number of distinct values
+                # in the group, subtracting 1 if NaNs are present
+                # since that is a distinct value we shouldn't count
                 else:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[lexsort_indexer[j]] = \
-                            (grp_vals_seen - 1 - (grp_na_count > 0))
+                    grp_size = grp_vals_seen - (grp_na_count > 0)
+
+                for j in range(grp_start, i + 1):
+                    grp_sizes[lexsort_indexer[j]] = grp_size
+
                 dups = sum_ranks = 0
                 grp_na_count = 0
                 grp_start = i + 1
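
The net effect of the grp_size hunks: the percentile denominator is computed once per group and written back in a single loop, rather than being recomputed inside the write-back loop as before. A plain-Python paraphrase of the two formulas with illustrative inputs (it mirrors the Cython variable names but elides the surrounding loop machinery):

# Plain-Python paraphrase of the grp_size computation in rank_1d.

def group_size(i, grp_start, grp_na_count, grp_vals_seen, dense):
    if not dense:
        # Non-dense tiebreaks: denominator is the number of non-null
        # elements in the group.
        return i - grp_start + 1 - grp_na_count
    # Dense tiebreak: denominator is the number of distinct values seen,
    # minus 1 if NaNs are present (a "seen" value that shouldn't count).
    return grp_vals_seen - (grp_na_count > 0)


# A group spanning positions 0..4 with one NaN and 4 "values seen"
# (3 distinct non-null values plus the NaN):
print(group_size(i=4, grp_start=0, grp_na_count=1, grp_vals_seen=4, dense=False))  # 4
print(group_size(i=4, grp_start=0, grp_na_count=1, grp_vals_seen=4, dense=True))   # 3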
