
Commit b90967d

Merge remote-tracking branch 'upstream/master' into fix-20432
2 parents: b282327 + cc84a23

88 files changed: 1305 additions and 779 deletions


.pre-commit-config.yaml

Lines changed: 9 additions & 3 deletions
@@ -36,7 +36,7 @@ repos:
     rev: 3.9.0
     hooks:
     -   id: flake8
-        additional_dependencies: [flake8-comprehensions>=3.1.0]
+        additional_dependencies: [flake8-comprehensions>=3.1.0, flake8-bugbear>=21.3.2]
     -   id: flake8
         name: flake8 (cython)
         types: [cython]
@@ -86,11 +86,10 @@ repos:
         types: [python]
         exclude: ^pandas/_typing\.py$
     -   id: inconsistent-namespace-usage
-        name: 'Check for inconsistent use of pandas namespace in tests'
+        name: 'Check for inconsistent use of pandas namespace'
         entry: python scripts/check_for_inconsistent_pandas_namespace.py
         language: python
         types: [python]
-        files: ^pandas/tests/
     -   id: incorrect-code-directives
         name: Check for incorrect code block or IPython directives
         language: pygrep
@@ -213,3 +212,10 @@ repos:
         |\#\ type:\s?ignore(?!\[)
         language: pygrep
         types: [python]
+    -   id: use-pd_array-in-core
+        name: Import pandas.array as pd_array in core
+        language: python
+        entry: python scripts/use_pd_array_in_core.py
+        files: ^pandas/core/
+        exclude: ^pandas/core/api\.py$
+        types: [python]
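
The new use-pd_array-in-core hook runs scripts/use_pd_array_in_core.py, which this commit adds elsewhere in the file tree; the script body is not part of the hunks shown here. As a rough sketch of what a checker wired into pre-commit this way could look like (the regex, message, and argument handling below are assumptions, not the actual script):

# Hypothetical sketch only -- the real scripts/use_pd_array_in_core.py is not
# shown in this commit and may differ substantially.
import argparse
import re
import sys

# Flag direct "pd.array(" calls; core code is expected to import the function
# as pd_array and call that instead.
PD_ARRAY_CALL = re.compile(r"\bpd\.array\(")


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("paths", nargs="*")  # pre-commit passes matched files
    args = parser.parse_args()

    failed = False
    for path in args.paths:
        with open(path, encoding="utf-8") as fh:
            for lineno, line in enumerate(fh, start=1):
                if PD_ARRAY_CALL.search(line):
                    print(f"{path}:{lineno}: use pd_array instead of pd.array")
                    failed = True
    return int(failed)


if __name__ == "__main__":
    sys.exit(main())

pre-commit invokes the entry with the matched file paths as arguments and fails the hook on a nonzero exit status, which is why the sketch returns 1 on a violation.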

asv_bench/benchmarks/arithmetic.py

Lines changed: 9 additions & 11 deletions
@@ -140,9 +140,7 @@ def setup(self, op, shape):
         # construct dataframe with 2 blocks
         arr1 = np.random.randn(n_rows, n_cols // 2).astype("f8")
         arr2 = np.random.randn(n_rows, n_cols // 2).astype("f4")
-        df = pd.concat(
-            [pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True
-        )
+        df = pd.concat([DataFrame(arr1), DataFrame(arr2)], axis=1, ignore_index=True)
         # should already be the case, but just to be sure
         df._consolidate_inplace()

@@ -151,7 +149,7 @@ def setup(self, op, shape):
         arr2 = np.random.randn(n_rows, n_cols // 2).astype("i8")
         arr3 = np.random.randn(n_rows, n_cols // 4).astype("f8")
         df2 = pd.concat(
-            [pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)],
+            [DataFrame(arr1), DataFrame(arr2), DataFrame(arr3)],
             axis=1,
             ignore_index=True,
         )
@@ -459,9 +457,9 @@ class OffsetArrayArithmetic:

     def setup(self, offset):
         N = 10000
-        rng = pd.date_range(start="1/1/2000", periods=N, freq="T")
+        rng = date_range(start="1/1/2000", periods=N, freq="T")
         self.rng = rng
-        self.ser = pd.Series(rng)
+        self.ser = Series(rng)

     def time_add_series_offset(self, offset):
         with warnings.catch_warnings(record=True):
@@ -478,7 +476,7 @@ class ApplyIndex:

     def setup(self, offset):
         N = 10000
-        rng = pd.date_range(start="1/1/2000", periods=N, freq="T")
+        rng = date_range(start="1/1/2000", periods=N, freq="T")
         self.rng = rng

     def time_apply_index(self, offset):
@@ -490,17 +488,17 @@ class BinaryOpsMultiIndex:
     param_names = ["func"]

     def setup(self, func):
-        date_range = pd.date_range("20200101 00:00", "20200102 0:00", freq="S")
+        array = date_range("20200101 00:00", "20200102 0:00", freq="S")
         level_0_names = [str(i) for i in range(30)]

-        index = pd.MultiIndex.from_product([level_0_names, date_range])
+        index = pd.MultiIndex.from_product([level_0_names, array])
         column_names = ["col_1", "col_2"]

-        self.df = pd.DataFrame(
+        self.df = DataFrame(
             np.random.rand(len(index), 2), index=index, columns=column_names
         )

-        self.arg_df = pd.DataFrame(
+        self.arg_df = DataFrame(
             np.random.randint(1, 10, (len(level_0_names), 2)),
             index=level_0_names,
             columns=column_names,
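
These hunks swap pd.-prefixed calls for bare DataFrame, Series, and date_range, matching the inconsistent-namespace-usage hook above, which this commit widens beyond pandas/tests/. The bare names only resolve if the module imports them directly; a minimal sketch of the imports the edited benchmarks presumably rely on (the actual import block sits outside the hunks shown):

# Presumed top-of-module imports for the edited benchmarks; the real import
# block is not visible in these hunks, so treat this as an assumption.
import warnings

import numpy as np

import pandas as pd
from pandas import DataFrame, Series, date_range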

asv_bench/benchmarks/sparse.py

Lines changed: 2 additions & 2 deletions
@@ -28,7 +28,7 @@ def setup(self):
             data = np.random.randn(N)[:-i]
             idx = rng[:-i]
             data[100:] = np.nan
-            self.series[i] = pd.Series(pd.SparseArray(data), index=idx)
+            self.series[i] = Series(SparseArray(data), index=idx)

     def time_series_to_frame(self):
         pd.DataFrame(self.series)
@@ -63,7 +63,7 @@ def setup(self):
         )

     def time_sparse_series_from_coo(self):
-        pd.Series.sparse.from_coo(self.matrix)
+        Series.sparse.from_coo(self.matrix)


 class ToCoo:

doc/source/ecosystem.rst

Lines changed: 12 additions & 11 deletions
@@ -475,7 +475,7 @@ arrays can be stored inside pandas' Series and DataFrame.
 `Pandas-Genomics`_
 ~~~~~~~~~~~~~~~~~~

-Pandas-Genomics provides extension types and extension arrays for working with genomics data
+Pandas-Genomics provides extension types, extension arrays, and extension accessors for working with genomics data

 `Pint-Pandas`_
 ~~~~~~~~~~~~~~
@@ -502,16 +502,17 @@ A directory of projects providing
 :ref:`extension accessors <extending.register-accessors>`. This is for users to
 discover new accessors and for library authors to coordinate on the namespace.

-=============== ============ ==================================== ===============================================================
-Library         Accessor     Classes                              Description
-=============== ============ ==================================== ===============================================================
-`cyberpandas`_  ``ip``       ``Series``                           Provides common operations for working with IP addresses.
-`pdvega`_       ``vgplot``   ``Series``, ``DataFrame``            Provides plotting functions from the Altair_ library.
-`pandas_path`_  ``path``     ``Index``, ``Series``                Provides `pathlib.Path`_ functions for Series.
-`pint-pandas`_  ``pint``     ``Series``, ``DataFrame``            Provides units support for numeric Series and DataFrames.
-`composeml`_    ``slice``    ``DataFrame``                        Provides a generator for enhanced data slicing.
-`datatest`_     ``validate`` ``Series``, ``DataFrame``, ``Index`` Provides validation, differences, and acceptance managers.
-=============== ============ ==================================== ===============================================================
+================== ============ ==================================== ===============================================================================
+Library            Accessor     Classes                              Description
+================== ============ ==================================== ===============================================================================
+`cyberpandas`_     ``ip``       ``Series``                           Provides common operations for working with IP addresses.
+`pdvega`_          ``vgplot``   ``Series``, ``DataFrame``            Provides plotting functions from the Altair_ library.
+`pandas-genomics`_ ``genomics`` ``Series``, ``DataFrame``            Provides common operations for quality control and analysis of genomics data
+`pandas_path`_     ``path``     ``Index``, ``Series``                Provides `pathlib.Path`_ functions for Series.
+`pint-pandas`_     ``pint``     ``Series``, ``DataFrame``            Provides units support for numeric Series and DataFrames.
+`composeml`_       ``slice``    ``DataFrame``                        Provides a generator for enhanced data slicing.
+`datatest`_        ``validate`` ``Series``, ``DataFrame``, ``Index`` Provides validation, differences, and acceptance managers.
+================== ============ ==================================== ===============================================================================

 .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest
 .. _pdvega: https://altair-viz.github.io/pdvega/
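
Each entry in this table attaches its namespace through pandas' public accessor-registration API. A toy sketch of how a library like pandas-genomics could register one (illustrative only, not the pandas-genomics implementation):

import pandas as pd


@pd.api.extensions.register_series_accessor("genomics")
class GenomicsAccessor:
    """Toy accessor; the real pandas-genomics accessor is far richer."""

    def __init__(self, obj: pd.Series) -> None:
        self._obj = obj

    def gc_content(self) -> float:
        # Fraction of G/C bases across all sequences in the Series.
        joined = "".join(self._obj)
        return sum(base in "GC" for base in joined) / len(joined)


seqs = pd.Series(["ACGT", "GGCC"])
print(seqs.genomics.gc_content())  # 0.75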

doc/source/whatsnew/v1.2.4.rst

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ Fixed regressions

 - Fixed regression in :meth:`DataFrame.sum` when ``min_count`` greater than the :class:`DataFrame` shape was passed resulted in a ``ValueError`` (:issue:`39738`)
 - Fixed regression in :meth:`DataFrame.to_json` raising ``AttributeError`` when run on PyPy (:issue:`39837`)
+- Fixed regression in (in)equality comparison of ``pd.NaT`` with a non-datetimelike numpy array returning a scalar instead of an array (:issue:`40722`)
 - Fixed regression in :meth:`DataFrame.where` not returning a copy in the case of an all True condition (:issue:`39595`)
 - Fixed regression in :meth:`DataFrame.replace` raising ``IndexError`` when ``regex`` was a multi-key dictionary (:issue:`39338`)
 -
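
To make the new pd.NaT entry concrete, here is a small sketch of the fixed behavior, inferred from the changelog wording rather than from the regression tests:

import numpy as np
import pandas as pd

arr = np.array([1, 2, 3])  # non-datetimelike ndarray

# The regression collapsed the comparison to a single scalar; after the fix
# it broadcasts elementwise, matching NaT's other array comparisons.
print(pd.NaT == arr)  # expected: [False False False]
print(pd.NaT != arr)  # expected: [ True  True  True]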

doc/source/whatsnew/v1.3.0.rst

Lines changed: 2 additions & 0 deletions
@@ -161,6 +161,7 @@ Other enhancements
 - :meth:`.Styler.apply` now more consistently accepts ndarray function returns, i.e. in all cases for ``axis`` is ``0, 1 or None`` (:issue:`39359`)
 - :meth:`.Styler.apply` and :meth:`.Styler.applymap` now raise errors if wrong format CSS is passed on render (:issue:`39660`)
 - :meth:`.Styler.format` adds keyword argument ``escape`` for optional HTML escaping (:issue:`40437`)
+- :meth:`.Styler.clear` now clears :attr:`Styler.hidden_index` and :attr:`Styler.hidden_columns` as well (:issue:`40484`)
 - Builtin highlighting methods in :class:`Styler` have a more consistent signature and css customisability (:issue:`40242`)
 - :meth:`Series.loc.__getitem__` and :meth:`Series.loc.__setitem__` with :class:`MultiIndex` now raising helpful error message when indexer has too many dimensions (:issue:`35349`)
 - :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files.
@@ -561,6 +562,7 @@ Numeric
 - Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`)
 - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
 - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
+- Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`)
 -

 Conversion
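
The DataFrameGroupBy.rank entry corresponds to the rank_1d changes in pandas/_libs/algos.pyx further down in this commit. A small sketch of the kind of case the fix targets, equal values sitting on either side of a group boundary (inputs illustrative):

import pandas as pd

# The 2 that ends group "a" ties with the 2s that start group "b"; pct
# ranks must nonetheless be computed within each group separately.
df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 2, 2]})

print(df.groupby("key")["val"].rank(pct=True))
# Expected with the default "average" method:
#   group "a": 0.50, 1.00    group "b": 0.75, 0.75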

environment.yml

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ dependencies:
   - black=20.8b1
   - cpplint
   - flake8
+  - flake8-bugbear>=21.3.2  # used by flake8, find likely bugs
   - flake8-comprehensions>=3.1.0  # used by flake8, linting of unnecessary comprehensions
   - isort>=5.2.1  # check that imports are in the right order
   - mypy=0.812
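
For a sense of what the new linter adds: flake8-bugbear flags likely-bug patterns that plain flake8 misses. Two classic examples (the check codes are bugbear's standard ones; this demo file is not part of the commit):

# bugbear_demo.py -- patterns flake8-bugbear reports that vanilla flake8 misses.
import time


def append_item(item, bucket=[]):  # B006: mutable default argument is shared
    bucket.append(item)            # across calls, so state leaks between them
    return bucket


def log(message, when=time.time()):  # B008: the default is evaluated once at
    print(when, message)             # import time, not on each call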

pandas/_libs/algos.pyx

Lines changed: 47 additions & 27 deletions
@@ -947,12 +947,14 @@ def rank_1d(
         TiebreakEnumType tiebreak
         Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0
         Py_ssize_t grp_vals_seen=1, grp_na_count=0
-        ndarray[int64_t, ndim=1] lexsort_indexer
-        ndarray[float64_t, ndim=1] grp_sizes, out
+        ndarray[int64_t, ndim=1] grp_sizes
+        ndarray[intp_t, ndim=1] lexsort_indexer
+        ndarray[float64_t, ndim=1] out
         ndarray[rank_t, ndim=1] masked_vals
         ndarray[uint8_t, ndim=1] mask
         bint keep_na, at_end, next_val_diff, check_labels, group_changed
         rank_t nan_fill_val
+        int64_t grp_size

     tiebreak = tiebreakers[ties_method]
     if tiebreak == TIEBREAK_FIRST:
@@ -965,7 +967,7 @@ def rank_1d(
     # TODO Cython 3.0: cast won't be necessary (#2992)
     assert <Py_ssize_t>len(labels) == N
     out = np.empty(N)
-    grp_sizes = np.ones(N)
+    grp_sizes = np.ones(N, dtype=np.int64)

     # If all 0 labels, can short-circuit later label
     # comparisons
@@ -1022,7 +1024,7 @@ def rank_1d(
     # each label corresponds to a different group value,
     # the mask helps you differentiate missing values before
     # performing sort on the actual values
-    lexsort_indexer = np.lexsort(order).astype(np.int64, copy=False)
+    lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False)

     if not ascending:
         lexsort_indexer = lexsort_indexer[::-1]
@@ -1093,13 +1095,15 @@ def rank_1d(
                 for j in range(i - dups + 1, i + 1):
                     out[lexsort_indexer[j]] = grp_vals_seen

-                # Look forward to the next value (using the sorting in lexsort_indexer)
-                # if the value does not equal the current value then we need to
-                # reset the dups and sum_ranks, knowing that a new value is
-                # coming up. The conditional also needs to handle nan equality
-                # and the end of iteration
-                if next_val_diff or (mask[lexsort_indexer[i]]
-                                     ^ mask[lexsort_indexer[i+1]]):
+                # Look forward to the next value (using the sorting in
+                # lexsort_indexer). If the value does not equal the current
+                # value then we need to reset the dups and sum_ranks, knowing
+                # that a new value is coming up. The conditional also needs
+                # to handle nan equality and the end of iteration. If group
+                # changes we do not record seeing a new value in the group
+                if not group_changed and (next_val_diff or
+                                          (mask[lexsort_indexer[i]]
+                                           ^ mask[lexsort_indexer[i+1]])):
                     dups = sum_ranks = 0
                     grp_vals_seen += 1

@@ -1110,14 +1114,21 @@ def rank_1d(
             # group encountered (used by pct calculations later). Also be
             # sure to reset any of the items helping to calculate dups
             if group_changed:
+
+                # If not dense tiebreak, group size used to compute
+                # percentile will be # of non-null elements in group
                 if tiebreak != TIEBREAK_DENSE:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[lexsort_indexer[j]] = \
-                            (i - grp_start + 1 - grp_na_count)
+                    grp_size = i - grp_start + 1 - grp_na_count
+
+                # Otherwise, it will be the number of distinct values
+                # in the group, subtracting 1 if NaNs are present
+                # since that is a distinct value we shouldn't count
                 else:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[lexsort_indexer[j]] = \
-                            (grp_vals_seen - 1 - (grp_na_count > 0))
+                    grp_size = grp_vals_seen - (grp_na_count > 0)
+
+                for j in range(grp_start, i + 1):
+                    grp_sizes[lexsort_indexer[j]] = grp_size
+
                 dups = sum_ranks = 0
                 grp_na_count = 0
                 grp_start = i + 1
@@ -1184,12 +1195,14 @@ def rank_1d(
                     out[lexsort_indexer[j]] = grp_vals_seen

                 # Look forward to the next value (using the sorting in
-                # lexsort_indexer) if the value does not equal the current
+                # lexsort_indexer). If the value does not equal the current
                 # value then we need to reset the dups and sum_ranks, knowing
                 # that a new value is coming up. The conditional also needs
-                # to handle nan equality and the end of iteration
-                if next_val_diff or (mask[lexsort_indexer[i]]
-                                     ^ mask[lexsort_indexer[i+1]]):
+                # to handle nan equality and the end of iteration. If group
+                # changes we do not record seeing a new value in the group
+                if not group_changed and (next_val_diff or
+                                          (mask[lexsort_indexer[i]]
+                                           ^ mask[lexsort_indexer[i+1]])):
                     dups = sum_ranks = 0
                     grp_vals_seen += 1

@@ -1200,14 +1213,21 @@ def rank_1d(
             # group encountered (used by pct calculations later). Also be
             # sure to reset any of the items helping to calculate dups
             if group_changed:
+
+                # If not dense tiebreak, group size used to compute
+                # percentile will be # of non-null elements in group
                 if tiebreak != TIEBREAK_DENSE:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[lexsort_indexer[j]] = \
-                            (i - grp_start + 1 - grp_na_count)
+                    grp_size = i - grp_start + 1 - grp_na_count
+
+                # Otherwise, it will be the number of distinct values
+                # in the group, subtracting 1 if NaNs are present
+                # since that is a distinct value we shouldn't count
                 else:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[lexsort_indexer[j]] = \
-                            (grp_vals_seen - 1 - (grp_na_count > 0))
+                    grp_size = grp_vals_seen - (grp_na_count > 0)
+
+                for j in range(grp_start, i + 1):
+                    grp_sizes[lexsort_indexer[j]] = grp_size
+
                 dups = sum_ranks = 0
                 grp_na_count = 0
                 grp_start = i + 1
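
The net effect of the grp_size hunks: the percentile denominator is computed once per group and written back in a single loop, rather than being recomputed inside the write-back loop as before. A plain-Python paraphrase of the two formulas with illustrative inputs (it mirrors the Cython variable names but elides the surrounding loop machinery):

# Plain-Python paraphrase of the grp_size computation in rank_1d.

def group_size(i, grp_start, grp_na_count, grp_vals_seen, dense):
    if not dense:
        # Non-dense tiebreaks: denominator is the number of non-null
        # elements in the group.
        return i - grp_start + 1 - grp_na_count
    # Dense tiebreak: denominator is the number of distinct values seen,
    # minus 1 if NaNs are present (a "seen" value that shouldn't count).
    return grp_vals_seen - (grp_na_count > 0)


# A group spanning positions 0..4 with one NaN and 4 "values seen"
# (3 distinct non-null values plus the NaN):
print(group_size(i=4, grp_start=0, grp_na_count=1, grp_vals_seen=4, dense=False))  # 4
print(group_size(i=4, grp_start=0, grp_na_count=1, grp_vals_seen=4, dense=True))   # 3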
