pandas-dev
diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml
Lines changed: 1 addition & 1 deletion
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
Lines changed: 2 additions & 2 deletions
diff --git a/ci/deps/actions-37-locale.yaml b/ci/deps/actions-37-locale.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/ci/deps/azure-37.yaml b/ci/deps/azure-37.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
Lines changed: 2 additions & 2 deletions
diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst
Lines changed: 1 addition & 1 deletion
diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst
Lines changed: 5 additions & 1 deletion
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
Lines changed: 5 additions & 3 deletions
diff --git a/environment.yml b/environment.yml
Lines changed: 1 addition & 1 deletion
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
Lines changed: 2 additions & 1 deletion
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
Lines changed: 36 additions & 9 deletions
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
Lines changed: 1 addition & 1 deletion
diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd
Lines changed: 3 additions & 0 deletions
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
Lines changed: 10 additions & 0 deletions
diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx
Lines changed: 0 additions & 6 deletions
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
Lines changed: 6 additions & 10 deletions
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
Lines changed: 12 additions & 1 deletion
diff --git a/pandas/_typing.py b/pandas/_typing.py
Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ jobs:
       - name: Install-pre-commit
         run: python -m pip install --upgrade pre-commit
       - name: Run pre-commit
-        run: pre-commit run --all-files || (exit 0)
+        run: pre-commit run --from-ref=origin/master --to-ref=HEAD --all-files || (exit 0)
       - name: Commit results
         run: |
           git config user.name "$(git log -1 --pretty=format:%an)"
 
@@ -108,8 +108,8 @@ def setup(self):
         self.vals_short = np.arange(2).astype(object)
         self.vals_long = np.arange(10 ** 5).astype(object)
         # because of nans floats are special:
-        self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float)).astype(object)
-        self.vals_long_floats = np.arange(10 ** 5, dtype=np.float).astype(object)
+        self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float_)).astype(object)
+        self.vals_long_floats = np.arange(10 ** 5, dtype=np.float_).astype(object)
 
     def time_isin_nans(self):
         # if nan-objects are different objects,
 
@@ -11,7 +11,7 @@ dependencies:
   - hypothesis>=3.58.0
 
   # required
-  - numpy
+  - numpy<1.20  # GH#39541 compat for pyarrow<3
   - python-dateutil
   - pytz
 
 
@@ -18,7 +18,7 @@ dependencies:
   - numpy
   - python-dateutil
   - nomkl
-  - pyarrow
+  - pyarrow=0.15.1
   - pytz
   - s3fs>=0.4.0
   - moto>=1.3.14
 
@@ -24,7 +24,7 @@ dependencies:
   - moto
   - nomkl
   - numexpr
-  - numpy
+  - numpy<1.20  # GH#39541 compat with pyarrow<3
   - openpyxl
   - pytables
   - python-dateutil
 
@@ -21,7 +21,7 @@ dependencies:
   - numexpr
   - numpy=1.16.5
   - openpyxl
-  - pyarrow>=0.15.0
+  - pyarrow=0.15.1
   - pytables
   - python-dateutil==2.7.3
   - pytz
 
@@ -199,8 +199,8 @@ in Python, so maybe we could minimize these by cythonizing the apply part.
       ...:     return s * dx
       ...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b,
       ...:                                            np.ndarray col_N):
-      ...:     assert (col_a.dtype == np.float
-      ...:             and col_b.dtype == np.float and col_N.dtype == np.int)
+      ...:     assert (col_a.dtype == np.float_
+      ...:             and col_b.dtype == np.float_ and col_N.dtype == np.int_)
       ...:     cdef Py_ssize_t i, n = len(col_N)
       ...:     assert (len(col_a) == len(col_b) == n)
       ...:     cdef np.ndarray[double] res = np.empty(n)
 
@@ -176,7 +176,7 @@ New plotting methods
 Vytautas Jancauskas, the 2012 GSOC participant, has added many new plot
 types. For example, ``'kde'`` is a new option:
 
-.. code-block:: python
+.. ipython:: python
 
    s = pd.Series(
        np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3))
 
@@ -14,6 +14,10 @@ including other versions of pandas.
 
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
+
+- Fixed regression in :func:`read_excel` that caused it to raise ``AttributeError`` when checking version of older xlrd versions (:issue:`38955`)
+- Fixed regression in :class:`DataFrame` constructor reordering elements when constructing from a datetime ndarray with a dtype other than ``"datetime64[ns]"`` (:issue:`39422`)
+- Fixed regression in :meth:`DataFrame.astype` and :meth:`Series.astype` not casting to bytes dtype (:issue:`39474`)
 - Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`)
 - Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`)
 - Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
@@ -26,7 +30,7 @@ Fixed regressions
 Bug fixes
 ~~~~~~~~~
 
--
+- :func:`pandas.read_excel` error message when a specified ``sheet_name`` does not exist is now uniform across engines (:issue:`39250`)
 -
 
 .. ---------------------------------------------------------------------------
 
@@ -55,6 +55,7 @@ Other enhancements
 - :meth:`DataFrame.plot.scatter` can now accept a categorical column as the argument to ``c`` (:issue:`12380`, :issue:`31357`)
 - :meth:`.Styler.set_tooltips` allows on hover tooltips to be added to styled HTML dataframes.
 - :meth:`Series.loc.__getitem__` and :meth:`Series.loc.__setitem__` with :class:`MultiIndex` now raising helpful error message when indexer has too many dimensions (:issue:`35349`)
+- :func:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files.
 
 .. ---------------------------------------------------------------------------
 
@@ -274,7 +275,6 @@ Datetimelike
 - Bug in constructing a :class:`DataFrame` or :class:`Series` with mismatched ``datetime64`` data and ``timedelta64`` dtype, or vice-versa, failing to raise ``TypeError`` (:issue:`38575`, :issue:`38764`, :issue:`38792`)
 - Bug in constructing a :class:`Series` or :class:`DataFrame` with a ``datetime`` object out of bounds for ``datetime64[ns]`` dtype or a ``timedelta`` object out of bounds for ``timedelta64[ns]`` dtype (:issue:`38792`, :issue:`38965`)
 - Bug in :meth:`DatetimeIndex.intersection`, :meth:`DatetimeIndex.symmetric_difference`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference` always returning object-dtype when operating with :class:`CategoricalIndex` (:issue:`38741`)
-- Bug in :class:`DataFrame` constructor reordering element when construction from datetime ndarray with dtype not ``"datetime64[ns]"`` (:issue:`39422`)
 - Bug in :meth:`Series.where` incorrectly casting ``datetime64`` values to ``int64`` (:issue:`37682`)
 - Bug in :class:`Categorical` incorrectly typecasting ``datetime`` object to ``Timestamp`` (:issue:`38878`)
 - Bug in comparisons between :class:`Timestamp` object and ``datetime64`` objects just outside the implementation bounds for nanosecond ``datetime64`` (:issue:`39221`)
@@ -284,7 +284,7 @@ Datetimelike
 Timedelta
 ^^^^^^^^^
 - Bug in constructing :class:`Timedelta` from ``np.timedelta64`` objects with non-nanosecond units that are out of bounds for ``timedelta64[ns]`` (:issue:`38965`)
--
+- Bug in constructing a :class:`TimedeltaIndex` incorrectly accepting ``np.datetime64("NaT")`` objects (:issue:`39462`)
 -
 
 Timezones
@@ -376,6 +376,7 @@ I/O
 - Bug in :func:`read_csv` not switching ``true_values`` and ``false_values`` for nullable ``boolean`` dtype (:issue:`34655`)
 - Bug in :func:`read_json` when ``orient="split"`` does not maintain numeric string index (:issue:`28556`)
 - :meth:`read_sql` returned an empty generator if ``chunksize`` was no-zero and the query returned no results. Now returns a generator with a single empty dataframe (:issue:`34411`)
+- Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`)
 
 Period
 ^^^^^^
@@ -412,6 +413,7 @@ Reshaping
 - :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`)
 - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`)
 - Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns (:issue:`39464`)
+- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
 
 Sparse
 ^^^^^^
@@ -432,7 +434,7 @@ Other
 - Bug in :class:`Index` constructor sometimes silently ignorning a specified ``dtype`` (:issue:`38879`)
 - Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`)
 - Bug in :class:`Styler` which caused CSS to duplicate on multiple renders. (:issue:`39395`)
--
+- Bug in :func:`pandas.testing.assert_series_equal`, :func:`pandas.testing.assert_frame_equal`, :func:`pandas.testing.assert_index_equal` and :func:`pandas.testing.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`)
 
 .. ---------------------------------------------------------------------------
 
 
@@ -3,7 +3,7 @@ channels:
   - conda-forge
 dependencies:
   # required
-  - numpy>=1.16.5
+  - numpy>=1.16.5, <1.20 # gh-39513
   - python=3
   - python-dateutil>=2.7.3
   - pytz
 
@@ -19,6 +19,7 @@ from pandas._libs.khash cimport (
     are_equivalent_float64_t,
     are_equivalent_khcomplex64_t,
     are_equivalent_khcomplex128_t,
+    kh_needed_n_buckets,
     kh_str_t,
     khcomplex64_t,
     khcomplex128_t,
@@ -152,7 +153,7 @@ def unique_label_indices(const int64_t[:] labels):
         ndarray[int64_t, ndim=1] arr
         Int64VectorData *ud = idx.data
 
-    kh_resize_int64(table, min(n, SIZE_HINT_LIMIT))
+    kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
 
     with nogil:
         for i in range(n):
 
@@ -392,9 +392,8 @@ cdef class {{name}}HashTable(HashTable):
 
     def __cinit__(self, int64_t size_hint=1):
         self.table = kh_init_{{dtype}}()
-        if size_hint is not None:
-            size_hint = min(size_hint, SIZE_HINT_LIMIT)
-            kh_resize_{{dtype}}(self.table, size_hint)
+        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+        kh_resize_{{dtype}}(self.table, size_hint)
 
     def __len__(self) -> int:
         return self.table.size
@@ -420,6 +419,15 @@ cdef class {{name}}HashTable(HashTable):
                                              sizeof(Py_ssize_t))   # vals
         return overhead + for_flags + for_pairs
 
+    def get_state(self):
+        """Return the table's n_buckets, size, n_occupied and upper_bound as a dict."""
+        return {
+            'n_buckets' : self.table.n_buckets,
+            'size' : self.table.size,
+            'n_occupied' : self.table.n_occupied,
+            'upper_bound' : self.table.upper_bound,
+        }
+
     cpdef get_item(self, {{dtype}}_t val):
         cdef:
             khiter_t k
@@ -731,9 +739,8 @@ cdef class StringHashTable(HashTable):
 
     def __init__(self, int64_t size_hint=1):
         self.table = kh_init_str()
-        if size_hint is not None:
-            size_hint = min(size_hint, SIZE_HINT_LIMIT)
-            kh_resize_str(self.table, size_hint)
+        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+        kh_resize_str(self.table, size_hint)
 
     def __dealloc__(self):
         if self.table is not NULL:
@@ -747,6 +754,15 @@ cdef class StringHashTable(HashTable):
                                              sizeof(Py_ssize_t))   # vals
         return overhead + for_flags + for_pairs
 
+    def get_state(self):
+        """Return the table's n_buckets, size, n_occupied and upper_bound as a dict."""
+        return {
+            'n_buckets' : self.table.n_buckets,
+            'size' : self.table.size,
+            'n_occupied' : self.table.n_occupied,
+            'upper_bound' : self.table.upper_bound,
+        }
+
     cpdef get_item(self, str val):
         cdef:
             khiter_t k
@@ -1044,9 +1060,8 @@ cdef class PyObjectHashTable(HashTable):
 
     def __init__(self, int64_t size_hint=1):
         self.table = kh_init_pymap()
-        if size_hint is not None:
-            size_hint = min(size_hint, SIZE_HINT_LIMIT)
-            kh_resize_pymap(self.table, size_hint)
+        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+        kh_resize_pymap(self.table, size_hint)
 
     def __dealloc__(self):
         if self.table is not NULL:
@@ -1072,6 +1087,18 @@ cdef class PyObjectHashTable(HashTable):
                                              sizeof(Py_ssize_t))   # vals
         return overhead + for_flags + for_pairs
 
+    def get_state(self):
+        """
+        Return a dict with the current n_buckets, size, n_occupied and
+        upper_bound of the underlying hash table.
+        """
+        return {
+            'n_buckets' : self.table.n_buckets,
+            'size' : self.table.size,
+            'n_occupied' : self.table.n_occupied,
+            'upper_bound' : self.table.upper_bound,
+        }
+
     cpdef get_item(self, object val):
         cdef:
             khiter_t k
 
@@ -121,7 +121,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
         kh_{{ttype}}_t *table = kh_init_{{ttype}}()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
 
-    kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT))
+    kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
 
     if keep not in ('last', 'first', False):
         raise ValueError('keep must be either "first", "last" or False')
 
@@ -120,4 +120,7 @@ cdef extern from "khash_python.h":
 
     bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
 
+    khuint_t kh_needed_n_buckets(khuint_t element_n) nogil
+
+
 include "khash_for_primitive_helper.pxi"
@@ -244,3 +244,13 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
 void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) {
 	kh_resize_str(table->table, val);
 }
+
+// utility function: returns the number of buckets needed for n_elements, i.e.
+// kroundup32(n_elements), doubled if its __ac_HASH_UPPER bound is too small
+khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){
+    khuint_t candidate = n_elements;
+    kroundup32(candidate);
+    khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5);
+    return (upper_bound < n_elements) ? 2*candidate : candidate;
+
+}
@@ -286,12 +286,6 @@ cdef class _NaT(datetime):
     def __hash__(self):
         return NPY_NAT
 
-    def __int__(self):
-        return NPY_NAT
-
-    def __long__(self):
-        return NPY_NAT
-
     @property
     def is_leap_year(self) -> bool:
         return False
 
@@ -257,41 +257,37 @@ cdef convert_to_timedelta64(object ts, str unit):
     elif isinstance(ts, _Timedelta):
         # already in the proper format
         ts = np.timedelta64(ts.value, "ns")
-    elif is_datetime64_object(ts):
-        # only accept a NaT here
-        if ts.astype('int64') == NPY_NAT:
-            return np.timedelta64(NPY_NAT)
     elif is_timedelta64_object(ts):
         ts = ensure_td64ns(ts)
     elif is_integer_object(ts):
         if ts == NPY_NAT:
             return np.timedelta64(NPY_NAT, "ns")
         else:
-            if unit in ['Y', 'M', 'W']:
+            if unit in ["Y", "M", "W"]:
                 ts = np.timedelta64(ts, unit)
             else:
                 ts = cast_from_unit(ts, unit)
                 ts = np.timedelta64(ts, "ns")
     elif is_float_object(ts):
-        if unit in ['Y', 'M', 'W']:
+        if unit in ["Y", "M", "W"]:
             ts = np.timedelta64(int(ts), unit)
         else:
             ts = cast_from_unit(ts, unit)
             ts = np.timedelta64(ts, "ns")
     elif isinstance(ts, str):
-        if len(ts) > 0 and ts[0] == 'P':
+        if len(ts) > 0 and ts[0] == "P":
             ts = parse_iso_format_string(ts)
         else:
             ts = parse_timedelta_string(ts)
         ts = np.timedelta64(ts, "ns")
     elif is_tick_object(ts):
-        ts = np.timedelta64(ts.nanos, 'ns')
+        ts = np.timedelta64(ts.nanos, "ns")
 
     if PyDelta_Check(ts):
-        ts = np.timedelta64(delta_to_nanoseconds(ts), 'ns')
+        ts = np.timedelta64(delta_to_nanoseconds(ts), "ns")
     elif not is_timedelta64_object(ts):
         raise ValueError(f"Invalid type for timedelta scalar: {type(ts)}")
-    return ts.astype('timedelta64[ns]')
+    return ts.astype("timedelta64[ns]")
 
 
 @cython.boundscheck(False)
 
@@ -459,13 +459,24 @@ def assert_attr_equal(attr: str, left, right, obj: str = "Attributes"):
     ):
         # np.nan
         return True
+    elif (
+        isinstance(left_attr, (np.datetime64, np.timedelta64))
+        and isinstance(right_attr, (np.datetime64, np.timedelta64))
+        and type(left_attr) is type(right_attr)
+        and np.isnat(left_attr)
+        and np.isnat(right_attr)
+    ):
+        # np.datetime64("nat") or np.timedelta64("nat")
+        return True
 
     try:
         result = left_attr == right_attr
     except TypeError:
         # datetimetz on rhs may raise TypeError
         result = False
-    if not isinstance(result, bool):
+    if (left_attr is pd.NA) ^ (right_attr is pd.NA):
+        result = False
+    elif not isinstance(result, bool):
         result = result.all()
 
     if result:
 
@@ -91,7 +91,7 @@
 Suffixes = Tuple[str, str]
 Ordered = Optional[bool]
 JSONSerializable = Optional[Union[PythonScalar, List, Dict]]
-Axes = Collection
+Axes = Collection[Any]
 
 # dtypes
 NpDtype = Union[str, np.dtype]