
Commit 3c63b6f

Merge remote-tracking branch 'upstream/master' into docfix-multiindex-set_levels
2 parents: 5e84250 + 7670262

20 files changed, +95 -62 lines


doc/source/whatsnew/v1.0.0.rst

Lines changed: 3 additions & 0 deletions
@@ -836,6 +836,7 @@ Interval
 
 - Bug in :meth:`IntervalIndex.get_indexer` where a :class:`Categorical` or :class:`CategoricalIndex` ``target`` would incorrectly raise a ``TypeError`` (:issue:`30063`)
 - Bug in ``pandas.core.dtypes.cast.infer_dtype_from_scalar`` where passing ``pandas_dtype=True`` did not infer :class:`IntervalDtype` (:issue:`30337`)
+- Bug in :class:`IntervalDtype` where the ``kind`` attribute was incorrectly set as ``None`` instead of ``"O"`` (:issue:`30568`)
 
 Indexing
 ^^^^^^^^
@@ -921,6 +922,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` when using axis=1 and having a single level columns index (:issue:`30208`)
 - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`)
 - Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`)
+- Bug in :meth:`GroupBy.pct_change` and :meth:`SeriesGroupBy.pct_change` causes ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`)
 
 Reshaping
 ^^^^^^^^^
@@ -963,6 +965,7 @@ Other
 - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`)
 - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`)
 - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`)
+- Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`)
 - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
 - Fix :class:`AbstractHolidayCalendar` to return correct results for
   years after 2030 (now goes up to 2200) (:issue:`27790`)

pandas/_libs/hashing.pyx

Lines changed: 6 additions & 0 deletions
@@ -70,6 +70,12 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'):
             # null, stringify and encode
             data = <bytes>str(val).encode(encoding)
 
+        elif isinstance(val, tuple):
+            # GH#28969 we could have a tuple, but need to ensure that
+            # the tuple entries are themselves hashable before converting
+            # to str
+            hash(val)
+            data = <bytes>str(val).encode(encoding)
         else:
             raise TypeError(f"{val} of type {type(val)} is not a valid type "
                             "for hashing, must be string or null")

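The new branch calls hash(val) purely as a hashability check before stringifying, so tuples of hashable values hash cleanly while tuples holding unhashable objects still fail loudly. A minimal sketch of the resulting behavior (not part of the commit; exact hash values depend on the hash key, so none are asserted here):

import pandas as pd
from pandas.util import hash_pandas_object

# Tuples of hashable values now hash instead of raising (GH#28969).
df = pd.DataFrame({"data": [(1,), (2,)]})
print(hash_pandas_object(df))  # uint64 Series, one hash per row

# Tuples containing unhashable values are still rejected up front by hash(val).
try:
    hash_pandas_object(pd.DataFrame({"data": [(1, []), (2, {})]}))
except TypeError as err:
    print(err)  # unhashable type: 'list'
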
pandas/compat/pickle_compat.py

Lines changed: 2 additions & 2 deletions
@@ -169,9 +169,9 @@ def __new__(cls) -> "DataFrame":  # type: ignore
 
 
 # our Unpickler sub-class to override methods and some dispatcher
-# functions for compat
-
+# functions for compat and uses a non-public class of the pickle module.
 
+# error: Name 'pkl._Unpickler' is not defined
 class Unpickler(pkl._Unpickler):  # type: ignore
     def find_class(self, module, name):
         # override superclass

pandas/core/arrays/sparse/dtype.py

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ class SparseDtype(ExtensionDtype):
     # hash(nan) is (sometimes?) 0.
     _metadata = ("_dtype", "_fill_value", "_is_na_fill_value")
 
-    def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None:
+    def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None):
 
         if isinstance(dtype, type(self)):
             if fill_value is None:

pandas/core/computation/align.py

Lines changed: 16 additions & 9 deletions
@@ -2,10 +2,12 @@
 """
 
 from functools import partial, wraps
+from typing import Dict, Optional, Sequence, Tuple, Type, Union
 import warnings
 
 import numpy as np
 
+from pandas._typing import FrameOrSeries
 from pandas.errors import PerformanceWarning
 
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
@@ -15,22 +17,27 @@
 from pandas.core.computation.common import result_type_many
 
 
-def _align_core_single_unary_op(term):
+def _align_core_single_unary_op(
+    term,
+) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, int]]]:
+
+    typ: Union[partial, Type[FrameOrSeries]]
+    axes: Optional[Dict[str, int]] = None
+
     if isinstance(term.value, np.ndarray):
         typ = partial(np.asanyarray, dtype=term.value.dtype)
     else:
         typ = type(term.value)
-    ret = (typ,)
+        if hasattr(term.value, "axes"):
+            axes = _zip_axes_from_type(typ, term.value.axes)
 
-    if not hasattr(term.value, "axes"):
-        ret += (None,)
-    else:
-        ret += (_zip_axes_from_type(typ, term.value.axes),)
-    return ret
+    return typ, axes
 
 
-def _zip_axes_from_type(typ, new_axes):
-    axes = {ax_name: new_axes[ax_ind] for ax_ind, ax_name in typ._AXIS_NAMES.items()}
+def _zip_axes_from_type(
+    typ: Type[FrameOrSeries], new_axes: Sequence[int]
+) -> Dict[str, int]:
+    axes = {name: new_axes[i] for i, name in typ._AXIS_NAMES.items()}
     return axes
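
A rough, hypothetical illustration of what the refactored helper now returns (not from the commit; FakeTerm is a stand-in for the expression engine's term objects, and the snippet relies on internal pandas APIs as they exist in this changeset):

from collections import namedtuple

import numpy as np
import pandas as pd

from pandas.core.computation.align import _align_core_single_unary_op

FakeTerm = namedtuple("FakeTerm", ["value"])  # hypothetical stand-in for a Term

# ndarray terms: a reconstruction callable and no axes mapping
typ, axes = _align_core_single_unary_op(FakeTerm(np.array([1.0, 2.0])))
print(typ, axes)  # functools.partial(asanyarray, dtype=float64) None

# pandas terms: the concrete class plus an axis-name -> axis mapping
typ, axes = _align_core_single_unary_op(FakeTerm(pd.Series([1, 2])))
print(typ, axes)  # <class 'pandas.core.series.Series'> {'index': RangeIndex(...)}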

pandas/core/config_init.py

Lines changed: 3 additions & 2 deletions
@@ -300,14 +300,15 @@ def table_schema_cb(key):
     _enable_data_resource_formatter(cf.get_option(key))
 
 
-def is_terminal():
+def is_terminal() -> bool:
     """
     Detect if Python is running in a terminal.
 
     Returns True if Python is running in a terminal or False if not.
     """
     try:
-        ip = get_ipython()
+        # error: Name 'get_ipython' is not defined
+        ip = get_ipython()  # type: ignore
     except NameError:  # assume standard Python interpreter in a terminal
         return True
     else:

pandas/core/dtypes/common.py

Lines changed: 8 additions & 1 deletion
@@ -633,7 +633,14 @@ def is_string_dtype(arr_or_dtype) -> bool:
 
     # TODO: gh-15585: consider making the checks stricter.
     def condition(dtype) -> bool:
-        return dtype.kind in ("O", "S", "U") and not is_period_dtype(dtype)
+        return dtype.kind in ("O", "S", "U") and not is_excluded_dtype(dtype)
+
+    def is_excluded_dtype(dtype) -> bool:
+        """
+        These have kind = "O" but aren't string dtypes so need to be explicitly excluded
+        """
+        is_excluded_checks = (is_period_dtype, is_interval_dtype)
+        return any(is_excluded(dtype) for is_excluded in is_excluded_checks)
 
     return _is_dtype(arr_or_dtype, condition)
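
A quick check of the new exclusion logic, assuming a pandas build that includes this change; both period and interval dtypes report kind "O" yet are not string dtypes:

import pandas as pd
from pandas.api.types import is_string_dtype

print(is_string_dtype(pd.PeriodDtype("D")))    # False (already excluded before)
print(is_string_dtype(pd.IntervalDtype()))     # False (newly excluded, GH 30568)
print(is_string_dtype(pd.Series(["a", "b"])))  # True, plain object-dtype strings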

pandas/core/dtypes/dtypes.py

Lines changed: 1 addition & 1 deletion
@@ -974,7 +974,7 @@ class IntervalDtype(PandasExtensionDtype):
     """
 
     name = "interval"
-    kind: Optional[str_type] = None
+    kind: str_type = "O"
     str = "|O08"
     base = np.dtype("O")
     num = 103
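
With this change the kind attribute matches the object kind of the backing NumPy dtype, so generic dtype checks no longer need a None special case. A small sketch, not part of the commit:

import pandas as pd

dtype = pd.IntervalDtype("int64")
print(dtype.kind)  # "O", previously None
print(dtype.base)  # object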

pandas/core/dtypes/generic.py

Lines changed: 4 additions & 1 deletion
@@ -4,7 +4,10 @@
 # define abstract base classes to enable isinstance type checking on our
 # objects
 def create_pandas_abc_type(name, attr, comp):
-    @classmethod
+
+    # https://github.com/python/mypy/issues/1006
+    # error: 'classmethod' used with a non-method
+    @classmethod  # type: ignore
     def _check(cls, inst) -> bool:
         return getattr(inst, attr, "_typ") in comp
 

pandas/core/groupby/generic.py

Lines changed: 3 additions & 0 deletions
@@ -759,6 +759,9 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None):
                     periods=periods, fill_method=fill_method, limit=limit, freq=freq
                 )
             )
+        if fill_method is None:  # GH30463
+            fill_method = "pad"
+            limit = 0
         filled = getattr(self, fill_method)(limit=limit)
         fill_grp = filled.groupby(self.grouper.codes)
         shifted = fill_grp.shift(periods=periods, freq=freq)
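
A hedged example of the fixed behavior (GH 30463): before this hunk the call below raised TypeError because getattr(self, None) is not a valid fill method; now fill_method=None is treated as "do not fill NaNs before computing the change".

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b"], "vals": [1.0, np.nan, 2.0, 4.0]})

# Previously: TypeError; now: percentage change computed without pre-filling NaNs.
print(df.groupby("key")["vals"].pct_change(fill_method=None))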

pandas/core/groupby/groupby.py

Lines changed: 5 additions & 2 deletions
@@ -2340,6 +2340,9 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0
                     axis=axis,
                 )
             )
+        if fill_method is None:  # GH30463
+            fill_method = "pad"
+            limit = 0
         filled = getattr(self, fill_method)(limit=limit)
         fill_grp = filled.groupby(self.grouper.codes)
         shifted = fill_grp.shift(periods=periods, freq=freq)
@@ -2508,9 +2511,9 @@ def get_groupby(
     squeeze: bool = False,
     observed: bool = False,
     mutated: bool = False,
-):
+) -> GroupBy:
 
-    klass: Union[Type["SeriesGroupBy"], Type["DataFrameGroupBy"]]
+    klass: Type[GroupBy]
     if isinstance(obj, Series):
         from pandas.core.groupby.generic import SeriesGroupBy
 

pandas/core/indexes/category.py

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,5 @@
 import operator
-from typing import Any
+from typing import Any, List
 
 import numpy as np
 
@@ -583,6 +583,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
 
         target = ibase.ensure_index(target)
 
+        missing: List[int]
         if self.equals(target):
             indexer = None
             missing = []

pandas/core/util/hashing.py

Lines changed: 2 additions & 1 deletion
@@ -85,11 +85,12 @@ def hash_pandas_object(
     if isinstance(obj, ABCMultiIndex):
         return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)
 
-    if isinstance(obj, ABCIndexClass):
+    elif isinstance(obj, ABCIndexClass):
         h = hash_array(obj.values, encoding, hash_key, categorize).astype(
             "uint64", copy=False
         )
         h = Series(h, index=obj, dtype="uint64", copy=False)
+
     elif isinstance(obj, ABCSeries):
         h = hash_array(obj.values, encoding, hash_key, categorize).astype(
             "uint64", copy=False

pandas/io/json/_normalize.py

Lines changed: 5 additions & 5 deletions
@@ -112,7 +112,7 @@ def nested_to_record(
 def _json_normalize(
     data: Union[Dict, List[Dict]],
     record_path: Optional[Union[str, List]] = None,
-    meta: Optional[Union[str, List]] = None,
+    meta: Optional[Union[str, List[Union[str, List[str]]]]] = None,
     meta_prefix: Optional[str] = None,
     record_prefix: Optional[str] = None,
     errors: Optional[str] = "raise",
@@ -265,21 +265,21 @@ def _pull_field(js, spec):
     elif not isinstance(meta, list):
         meta = [meta]
 
-    meta = [m if isinstance(m, list) else [m] for m in meta]
+    _meta = [m if isinstance(m, list) else [m] for m in meta]
 
     # Disastrously inefficient for now
     records: List = []
     lengths = []
 
     meta_vals: DefaultDict = defaultdict(list)
-    meta_keys = [sep.join(val) for val in meta]
+    meta_keys = [sep.join(val) for val in _meta]
 
     def _recursive_extract(data, path, seen_meta, level=0):
         if isinstance(data, dict):
             data = [data]
         if len(path) > 1:
             for obj in data:
-                for val, key in zip(meta, meta_keys):
+                for val, key in zip(_meta, meta_keys):
                     if level + 1 == len(val):
                         seen_meta[key] = _pull_field(obj, val[-1])
 
@@ -296,7 +296,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
 
                 # For repeating the metadata later
                 lengths.append(len(recs))
-                for val, key in zip(meta, meta_keys):
+                for val, key in zip(_meta, meta_keys):
                     if level + 1 > len(val):
                         meta_val = seen_meta[key]
                     else:
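
The widened annotation documents that meta accepts a single key, a flat list of keys, or a mix of keys and nested-key paths (lists of str). A small usage sketch, not part of the commit, using the public pd.json_normalize wrapper:

import pandas as pd

data = [
    {
        "state": "FL",
        "info": {"governor": "Rick Scott"},
        "counties": [{"name": "Dade", "population": 12345}],
    }
]

# meta mixes a plain key ("state") with a nested path (["info", "governor"]).
result = pd.json_normalize(
    data, record_path="counties", meta=["state", ["info", "governor"]]
)
print(result)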

pandas/tests/dtypes/test_dtypes.py

Lines changed: 4 additions & 0 deletions
@@ -685,6 +685,10 @@ def test_caching(self):
         tm.round_trip_pickle(dtype)
         assert len(IntervalDtype._cache) == 0
 
+    def test_not_string(self):
+        # GH30568: though IntervalDtype has object kind, it cannot be string
+        assert not is_string_dtype(IntervalDtype())
+
 
 class TestCategoricalDtypeParametrized:
     @pytest.mark.parametrize(

pandas/tests/extension/base/dtype.py

Lines changed: 1 addition & 2 deletions
@@ -16,8 +16,7 @@ def test_name(self, dtype):
 
     def test_kind(self, dtype):
         valid = set("biufcmMOSUV")
-        if dtype.kind is not None:
-            assert dtype.kind in valid
+        assert dtype.kind in valid
 
     def test_construct_from_string_own_name(self, dtype):
         result = dtype.construct_from_string(dtype.name)

pandas/tests/groupby/test_transform.py

Lines changed: 7 additions & 15 deletions
@@ -877,27 +877,19 @@ def test_pad_stable_sorting(fill_method):
         ),
     ],
 )
-@pytest.mark.parametrize(
-    "periods,fill_method,limit",
-    [
-        (1, "ffill", None),
-        (1, "ffill", 1),
-        (1, "bfill", None),
-        (1, "bfill", 1),
-        (-1, "ffill", None),
-        (-1, "ffill", 1),
-        (-1, "bfill", None),
-        (-1, "bfill", 1),
-    ],
-)
+@pytest.mark.parametrize("periods", [1, -1])
+@pytest.mark.parametrize("fill_method", ["ffill", "bfill", None])
+@pytest.mark.parametrize("limit", [None, 1])
 def test_pct_change(test_series, freq, periods, fill_method, limit):
-    # GH 21200, 21621
+    # GH 21200, 21621, 30463
     vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4]
     keys = ["a", "b"]
     key_v = np.repeat(keys, len(vals))
     df = DataFrame({"key": key_v, "vals": vals * 2})
 
-    df_g = getattr(df.groupby("key"), fill_method)(limit=limit)
+    df_g = df
+    if fill_method is not None:
+        df_g = getattr(df.groupby("key"), fill_method)(limit=limit)
     grp = df_g.groupby(df.key)
 
     expected = grp["vals"].obj / grp["vals"].shift(periods) - 1

pandas/tests/util/test_hashing.py

Lines changed: 21 additions & 0 deletions
@@ -353,3 +353,24 @@ def test_hash_collisions():
 
     result = hash_array(np.asarray(hashes, dtype=object), "utf8")
     tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0))
+
+
+def test_hash_with_tuple():
+    # GH#28969 array containing a tuple raises on call to arr.astype(str)
+    # apparently a numpy bug github.com/numpy/numpy/issues/9441
+
+    df = pd.DataFrame({"data": [tuple("1"), tuple("2")]})
+    result = hash_pandas_object(df)
+    expected = pd.Series([10345501319357378243, 8331063931016360761], dtype=np.uint64)
+    tm.assert_series_equal(result, expected)
+
+    df2 = pd.DataFrame({"data": [tuple([1]), tuple([2])]})
+    result = hash_pandas_object(df2)
+    expected = pd.Series([9408946347443669104, 3278256261030523334], dtype=np.uint64)
+    tm.assert_series_equal(result, expected)
+
+    # require that the elements of such tuples are themselves hashable
+
+    df3 = pd.DataFrame({"data": [tuple([1, []]), tuple([2, {}])]})
+    with pytest.raises(TypeError, match="unhashable type: 'list'"):
+        hash_pandas_object(df3)

scripts/validate_docstrings.py

Lines changed: 1 addition & 1 deletion
@@ -286,7 +286,7 @@ def _load_obj(name):
             continue
 
     if "obj" not in locals():
-        raise ImportError("No module can be imported " 'from "{}"'.format(name))
+        raise ImportError(f'No module can be imported from "{name}"')
 
     for part in func_parts:
         obj = getattr(obj, part)
