
Commit 65b7536

Merge pull request #141 from pandas-dev/master
Sync Fork from Upstream Repo
2 parents: df14a3e + 4007513

10 files changed (+75 additions, −96 deletions)


pandas/core/algorithms.py

Lines changed: 10 additions & 30 deletions
@@ -1634,10 +1634,10 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3):
 
     Parameters
     ----------
-    arr : ndarray
+    arr : ndarray or ExtensionArray
     n : int
         number of periods
-    axis : int
+    axis : {0, 1}
         axis to shift on
     stacklevel : int
         The stacklevel for the lost dtype warning.
@@ -1651,7 +1651,8 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3):
     na = np.nan
     dtype = arr.dtype
 
-    if dtype.kind == "b":
+    is_bool = is_bool_dtype(dtype)
+    if is_bool:
         op = operator.xor
     else:
         op = operator.sub
@@ -1677,17 +1678,15 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3):
     dtype = arr.dtype
 
     is_timedelta = False
-    is_bool = False
     if needs_i8_conversion(arr.dtype):
         dtype = np.int64
         arr = arr.view("i8")
         na = iNaT
         is_timedelta = True
 
-    elif is_bool_dtype(dtype):
+    elif is_bool:
         # We have to cast in order to be able to hold np.nan
         dtype = np.object_
-        is_bool = True
 
     elif is_integer_dtype(dtype):
         # We have to cast in order to be able to hold np.nan
@@ -1708,45 +1707,26 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3):
     dtype = np.dtype(dtype)
     out_arr = np.empty(arr.shape, dtype=dtype)
 
-    na_indexer = [slice(None)] * arr.ndim
+    na_indexer = [slice(None)] * 2
     na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
     out_arr[tuple(na_indexer)] = na
 
-    if arr.ndim == 2 and arr.dtype.name in _diff_special:
+    if arr.dtype.name in _diff_special:
         # TODO: can diff_2d dtype specialization troubles be fixed by defining
         # out_arr inside diff_2d?
         algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta)
     else:
         # To keep mypy happy, _res_indexer is a list while res_indexer is
         # a tuple, ditto for lag_indexer.
-        _res_indexer = [slice(None)] * arr.ndim
+        _res_indexer = [slice(None)] * 2
         _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
         res_indexer = tuple(_res_indexer)
 
-        _lag_indexer = [slice(None)] * arr.ndim
+        _lag_indexer = [slice(None)] * 2
         _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
         lag_indexer = tuple(_lag_indexer)
 
-        # need to make sure that we account for na for datelike/timedelta
-        # we don't actually want to subtract these i8 numbers
-        if is_timedelta:
-            res = arr[res_indexer]
-            lag = arr[lag_indexer]
-
-            mask = (arr[res_indexer] == na) | (arr[lag_indexer] == na)
-            if mask.any():
-                res = res.copy()
-                res[mask] = 0
-                lag = lag.copy()
-                lag[mask] = 0
-
-            result = res - lag
-            result[mask] = na
-            out_arr[res_indexer] = result
-        elif is_bool:
-            out_arr[res_indexer] = arr[res_indexer] ^ arr[lag_indexer]
-        else:
-            out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]
+        out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer])
 
     if is_timedelta:
         out_arr = out_arr.view("timedelta64[ns]")
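
The refactor above collapses three dtype-specific branches into a single call through `op`, chosen up front: XOR for boolean data, subtraction for everything else (this is why a boolean Series.diff() XORs consecutive values into an object-dtype result). A minimal standalone sketch of that dispatch, assuming plain 1-D NumPy input; `toy_diff` and its object-dtype output are illustrative, not the pandas implementation:

    import operator

    import numpy as np

    def toy_diff(arr: np.ndarray, n: int = 1) -> np.ndarray:
        # choose the binary op the same way the patched diff() does
        op = operator.xor if arr.dtype.kind == "b" else operator.sub
        out = np.empty(arr.shape, dtype=object)  # object dtype can hold the leading NaN
        out[:n] = np.nan
        out[n:] = op(arr[n:], arr[:-n])
        return out

    toy_diff(np.array([True, False, True]))  # [nan, True, True]
    toy_diff(np.array([1.0, 3.0, 6.0]))      # [nan, 2.0, 3.0]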

pandas/core/groupby/groupby.py

Lines changed: 18 additions & 23 deletions
@@ -46,6 +46,7 @@ class providing the base-class of operations.
 )
 import pandas._libs.groupby as libgroupby
 from pandas._typing import (
+    ArrayLike,
     F,
     FrameOrSeries,
     FrameOrSeriesUnion,
@@ -68,7 +69,6 @@ class providing the base-class of operations.
     ensure_float,
     is_bool_dtype,
     is_datetime64_dtype,
-    is_extension_array_dtype,
     is_integer_dtype,
     is_numeric_dtype,
     is_object_dtype,
@@ -85,6 +85,7 @@ class providing the base-class of operations.
 from pandas.core.arrays import (
     Categorical,
     DatetimeArray,
+    ExtensionArray,
 )
 from pandas.core.base import (
     DataError,
@@ -2265,37 +2266,31 @@ def quantile(self, q=0.5, interpolation: str = "linear"):
         """
         from pandas import concat
 
-        def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]:
+        def pre_processor(vals: ArrayLike) -> Tuple[np.ndarray, Optional[np.dtype]]:
             if is_object_dtype(vals):
                 raise TypeError(
                     "'quantile' cannot be performed against 'object' dtypes!"
                 )
 
-            inference = None
+            inference: Optional[np.dtype] = None
             if is_integer_dtype(vals.dtype):
-                if is_extension_array_dtype(vals.dtype):
-                    # error: "ndarray" has no attribute "to_numpy"
-                    vals = vals.to_numpy(  # type: ignore[attr-defined]
-                        dtype=float, na_value=np.nan
-                    )
-                inference = np.int64
-            elif is_bool_dtype(vals.dtype) and is_extension_array_dtype(vals.dtype):
-                # error: "ndarray" has no attribute "to_numpy"
-                vals = vals.to_numpy(  # type: ignore[attr-defined]
-                    dtype=float, na_value=np.nan
-                )
+                if isinstance(vals, ExtensionArray):
+                    out = vals.to_numpy(dtype=float, na_value=np.nan)
+                else:
+                    out = vals
+                inference = np.dtype(np.int64)
+            elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray):
+                out = vals.to_numpy(dtype=float, na_value=np.nan)
             elif is_datetime64_dtype(vals.dtype):
-                # error: Incompatible types in assignment (expression has type
-                # "str", variable has type "Optional[Type[int64]]")
-                inference = "datetime64[ns]"  # type: ignore[assignment]
-                vals = np.asarray(vals).astype(float)
+                inference = np.dtype("datetime64[ns]")
+                out = np.asarray(vals).astype(float)
             elif is_timedelta64_dtype(vals.dtype):
-                # error: Incompatible types in assignment (expression has type "str",
-                # variable has type "Optional[Type[signedinteger[Any]]]")
-                inference = "timedelta64[ns]"  # type: ignore[assignment]
-                vals = np.asarray(vals).astype(float)
+                inference = np.dtype("timedelta64[ns]")
+                out = np.asarray(vals).astype(float)
+            else:
+                out = np.asarray(vals)
 
-            return vals, inference
+            return out, inference
 
         def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
             if inference:
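
The new pre_processor path boils down to: if the grouped values are an ExtensionArray (e.g. a nullable integer column), convert them to a float ndarray with NaN for missing entries and remember the dtype to restore afterwards. A small sketch of that conversion using public pandas API; the variable names mirror the hunk but are illustrative:

    import numpy as np
    import pandas as pd

    vals = pd.array([1, 2, None], dtype="Int64")           # nullable ExtensionArray
    if isinstance(vals, pd.api.extensions.ExtensionArray):
        out = vals.to_numpy(dtype=float, na_value=np.nan)  # array([ 1.,  2., nan])
        inference = np.dtype(np.int64)                     # remembered for post-processing
    else:
        out = np.asarray(vals)
        inference = None

The isinstance check is also what removes the earlier # type: ignore[attr-defined]: it narrows vals to ExtensionArray, where .to_numpy(dtype=..., na_value=...) is known to exist.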

pandas/core/groupby/ops.py

Lines changed: 8 additions & 15 deletions
@@ -65,6 +65,7 @@
     is_timedelta64_dtype,
     needs_i8_conversion,
 )
+from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import ABCCategoricalIndex
 from pandas.core.dtypes.missing import (
     isna,
@@ -522,7 +523,7 @@ def _disallow_invalid_ops(self, values: ArrayLike, how: str):
     @final
     def _ea_wrap_cython_operation(
         self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
-    ) -> Tuple[np.ndarray, Optional[List[str]]]:
+    ) -> np.ndarray:
         """
         If we have an ExtensionArray, unwrap, call _cython_operation, and
         re-wrap if appropriate.
@@ -539,10 +540,7 @@ def _ea_wrap_cython_operation(
             )
             if how in ["rank"]:
                 # preserve float64 dtype
-
-                # error: Incompatible return value type (got "ndarray", expected
-                # "Tuple[ndarray, Optional[List[str]]]")
-                return res_values  # type: ignore[return-value]
+                return res_values
 
             res_values = res_values.astype("i8", copy=False)
             result = type(orig_values)(res_values, dtype=orig_values.dtype)
@@ -555,14 +553,11 @@ def _ea_wrap_cython_operation(
                 kind, values, how, axis, min_count, **kwargs
             )
             dtype = maybe_cast_result_dtype(orig_values.dtype, how)
-            if is_extension_array_dtype(dtype):
-                # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has no
-                # attribute "construct_array_type"
-                cls = dtype.construct_array_type()  # type: ignore[union-attr]
+            if isinstance(dtype, ExtensionDtype):
+                cls = dtype.construct_array_type()
                 return cls._from_sequence(res_values, dtype=dtype)
-            # error: Incompatible return value type (got "ndarray", expected
-            # "Tuple[ndarray, Optional[List[str]]]")
-            return res_values  # type: ignore[return-value]
+
+            return res_values
 
         elif is_float_dtype(values.dtype):
             # FloatingArray
@@ -599,9 +594,7 @@ def _cython_operation(
         self._disallow_invalid_ops(values, how)
 
         if is_extension_array_dtype(values.dtype):
-            # error: Incompatible return value type (got "Tuple[ndarray,
-            # Optional[List[str]]]", expected "ndarray")
-            return self._ea_wrap_cython_operation(  # type: ignore[return-value]
+            return self._ea_wrap_cython_operation(
                 kind, values, how, axis, min_count, **kwargs
             )
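
Same idea as in groupby.py: `isinstance(dtype, ExtensionDtype)` narrows the `Union[np.dtype, ExtensionDtype]` returned by `maybe_cast_result_dtype`, so `construct_array_type()` no longer needs a `type: ignore`. A sketch of the re-wrap step under that assumption; `res_values` is a stand-in for the raw cython result, and `_from_sequence` is the private constructor the hunk itself calls:

    import numpy as np
    import pandas as pd
    from pandas.api.extensions import ExtensionDtype

    res_values = np.array([1, 2, 3], dtype="int64")  # illustrative cython output
    dtype = pd.Int64Dtype()                          # target dtype for the result

    if isinstance(dtype, ExtensionDtype):            # mypy narrows the union here
        cls = dtype.construct_array_type()           # e.g. IntegerArray
        result = cls._from_sequence(res_values, dtype=dtype)
    else:
        result = res_values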

pandas/core/indexes/base.py

Lines changed: 8 additions & 1 deletion
@@ -3876,7 +3876,14 @@ def _reindex_non_unique(self, target):
     # --------------------------------------------------------------------
     # Join Methods
 
-    def join(self, other, how="left", level=None, return_indexers=False, sort=False):
+    def join(
+        self,
+        other,
+        how: str_t = "left",
+        level=None,
+        return_indexers: bool = False,
+        sort: bool = False,
+    ):
         """
         Compute join_index and indexers to conform data
         structures to the new index.
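
Only the signature changes here (one parameter per line plus type annotations); behaviour is untouched. For reference, a typical call to the public method, with exact return types left unstated since they depend on the inputs:

    import pandas as pd

    left = pd.Index([1, 2, 3, 4])
    right = pd.Index([3, 4, 5])

    # with return_indexers=True the joined index comes back together with
    # positional indexers into `left` and `right` (or None when not needed)
    joined, lidx, ridx = left.join(right, how="inner", return_indexers=True)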

pandas/core/indexes/datetimelike.py

Lines changed: 6 additions & 1 deletion
@@ -827,7 +827,12 @@ def _union(self, other, sort):
     _join_precedence = 10
 
     def join(
-        self, other, how: str = "left", level=None, return_indexers=False, sort=False
+        self,
+        other,
+        how: str = "left",
+        level=None,
+        return_indexers: bool = False,
+        sort: bool = False,
     ):
         """
         See Index.join

pandas/core/reshape/reshape.py

Lines changed: 14 additions & 5 deletions
@@ -2,9 +2,11 @@
 
 import itertools
 from typing import (
+    TYPE_CHECKING,
     List,
     Optional,
     Union,
+    cast,
 )
 
 import numpy as np
@@ -44,6 +46,9 @@
     get_group_index_sorter,
 )
 
+if TYPE_CHECKING:
+    from pandas.core.arrays import ExtensionArray
+
 
 class _Unstacker:
     """
@@ -942,11 +947,11 @@ def _get_dummies_1d(
     data,
     prefix,
     prefix_sep="_",
-    dummy_na=False,
-    sparse=False,
-    drop_first=False,
+    dummy_na: bool = False,
+    sparse: bool = False,
+    drop_first: bool = False,
     dtype: Optional[Dtype] = None,
-):
+) -> DataFrame:
     from pandas.core.reshape.concat import concat
 
     # Series avoids inconsistent NaN handling
@@ -1029,6 +1034,8 @@ def get_empty_frame(data) -> DataFrame:
         sparse_series.append(Series(data=sarr, index=index, name=col))
 
         out = concat(sparse_series, axis=1, copy=False)
+        # TODO: overload concat with Literal for axis
+        out = cast(DataFrame, out)
         return out
 
     else:
@@ -1045,7 +1052,9 @@ def get_empty_frame(data) -> DataFrame:
         return DataFrame(dummy_mat, index=index, columns=dummy_cols)
 
 
-def _reorder_for_extension_array_stack(arr, n_rows: int, n_columns: int):
+def _reorder_for_extension_array_stack(
+    arr: ExtensionArray, n_rows: int, n_columns: int
+) -> ExtensionArray:
     """
     Re-orders the values when stacking multiple extension-arrays.
 
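
The added `cast(DataFrame, out)` is purely for the type checker: `concat` is typed as returning either a DataFrame or a Series, and `typing.cast` narrows that with no runtime effect. A minimal illustration of the same pattern, assuming nothing beyond public pandas API:

    from typing import cast

    import pandas as pd

    pieces = [pd.Series([1, 0], name="a"), pd.Series([0, 1], name="b")]
    out = pd.concat(pieces, axis=1, copy=False)  # stubs say DataFrame | Series
    out = cast(pd.DataFrame, out)                # no-op at runtime, narrows for mypy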

pandas/core/sorting.py

Lines changed: 1 addition & 2 deletions
@@ -43,7 +43,6 @@
 _INT64_MAX = np.iinfo(np.int64).max
 
 
-# error: Function "numpy.array" is not valid as a type
 def get_indexer_indexer(
     target: Index,
     level: Union[str, int, List[str], List[int]],
@@ -52,7 +51,7 @@ def get_indexer_indexer(
     na_position: str,
     sort_remaining: bool,
     key: IndexKeyFunc,
-) -> Optional[np.array]:  # type: ignore[valid-type]
+) -> Optional[np.ndarray]:
     """
     Helper method that return the indexer according to input parameters for
     the sort_index method of DataFrame and Series.
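
The fix here is that `np.array` is a factory function, not a class, so it is not a valid annotation; `np.ndarray` is, which removes both the explanatory comment and the `type: ignore[valid-type]`. A tiny illustration (the function name is made up):

    from typing import Optional

    import numpy as np

    def maybe_indexer(flag: bool) -> Optional[np.ndarray]:
        # np.ndarray is a real class and a valid return annotation;
        # annotating with np.array would be rejected by mypy.
        return np.arange(3) if flag else None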

pandas/core/tools/datetimes.py

Lines changed: 5 additions & 11 deletions
@@ -534,25 +534,19 @@ def _to_datetime_with_unit(arg, unit, name, tz, errors: Optional[str]) -> Index:
     # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
     # because it expects an ndarray argument
     if isinstance(arg, IntegerArray):
-        result = arg.astype(f"datetime64[{unit}]")
+        arr = arg.astype(f"datetime64[{unit}]")
         tz_parsed = None
     else:
-        result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
+        arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
 
     if errors == "ignore":
         # Index constructor _may_ infer to DatetimeIndex
-
-        # error: Incompatible types in assignment (expression has type "Index", variable
-        # has type "ExtensionArray")
-        result = Index(result, name=name)  # type: ignore[assignment]
+        result = Index(arr, name=name)
     else:
-        # error: Incompatible types in assignment (expression has type "DatetimeIndex",
-        # variable has type "ExtensionArray")
-        result = DatetimeIndex(result, name=name)  # type: ignore[assignment]
+        result = DatetimeIndex(arr, name=name)
 
     if not isinstance(result, DatetimeIndex):
-        # error: Incompatible return value type (got "ExtensionArray", expected "Index")
-        return result  # type: ignore[return-value]
+        return result
 
     # GH#23758: We may still need to localize the result with tz
     # GH#25546: Apply tz_parsed first (from arg), then tz (from caller)
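
Renaming the intermediate to `arr` lets `result` keep a single type (Index or DatetimeIndex), which is what removes the three `type: ignore` comments. The code path itself is the familiar epoch-with-unit conversion; the commented output below is the expected shape, not captured from this commit:

    import pandas as pd

    # epoch seconds; 1_577_836_800 s is 2020-01-01 00:00:00 UTC
    pd.to_datetime([1_577_836_800, 1_577_923_200], unit="s")
    # DatetimeIndex(['2020-01-01', '2020-01-02'], dtype='datetime64[ns]', freq=None)

    # a nullable IntegerArray input exercises the isinstance(arg, IntegerArray) branch
    pd.to_datetime(pd.array([1_577_836_800, 1_577_923_200], dtype="Int64"), unit="s")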

pandas/core/tools/numeric.py

Lines changed: 3 additions & 4 deletions
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import numpy as np
 
 from pandas._libs import lib
@@ -164,13 +166,10 @@ def to_numeric(arg, errors="raise", downcast=None):
 
     # GH33013: for IntegerArray & FloatingArray extract non-null values for casting
     # save mask to reconstruct the full array after casting
+    mask: Optional[np.ndarray] = None
    if isinstance(values, NumericArray):
         mask = values._mask
         values = values._data[~mask]
-    else:
-        # error: Incompatible types in assignment (expression has type "None", variable
-        # has type "ndarray")
-        mask = None  # type: ignore[assignment]
 
     values_dtype = getattr(values, "dtype", None)
     if is_numeric_dtype(values_dtype):
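
Declaring `mask: Optional[np.ndarray] = None` before the branch replaces the `else` that existed only to assign `None`, and drops the `type: ignore[assignment]`. A sketch of the pattern; `_mask` and `_data` are the private IntegerArray attributes the hunk relies on, and `data` is an illustrative name:

    from typing import Optional

    import numpy as np
    import pandas as pd

    values = pd.array([1, 2, None], dtype="Int64")

    mask: Optional[np.ndarray] = None        # declared up front, so mypy sees Optional
    if isinstance(values, pd.arrays.IntegerArray):
        mask = values._mask                  # boolean ndarray marking missing slots
        data = values._data[~mask]           # non-null values as a plain ndarray
    else:
        data = np.asarray(values)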
