From 5a030648a8e74d81806554614207774a5cdbede1 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 5 Oct 2022 07:27:46 +0100 Subject: [PATCH 01/20] ENH: Improve typing of some general functions --- pandas-stubs/core/algorithms.pyi | 15 +++- pandas-stubs/core/reshape/melt.pyi | 17 +++- tests/test_pandas.py | 133 ++++++++++++++++++++++++++++- 3 files changed, 160 insertions(+), 5 deletions(-) diff --git a/pandas-stubs/core/algorithms.pyi b/pandas-stubs/core/algorithms.pyi index 5663a0c77..1ef1905f5 100644 --- a/pandas-stubs/core/algorithms.pyi +++ b/pandas-stubs/core/algorithms.pyi @@ -4,9 +4,14 @@ from typing import ( ) import numpy as np +import pandas as pd from pandas import ( Categorical, + CategoricalIndex, + DatetimeIndex, Index, + PeriodIndex, + RangeIndex, Series, ) from pandas.api.extensions import ExtensionArray @@ -14,7 +19,15 @@ from pandas.api.extensions import ExtensionArray from pandas._typing import AnyArrayLike @overload -def unique(values: Index) -> Index: ... +def unique(values: DatetimeIndex) -> DatetimeIndex: ... +@overload +def unique(values: PeriodIndex) -> PeriodIndex: ... +@overload +def unique(values: CategoricalIndex) -> CategoricalIndex: ... +@overload +def unique(values: RangeIndex | pd.Float64Index) -> np.ndarray: ... +@overload +def unique(values: Index) -> Index | np.ndarray: ... @overload def unique(values: Categorical) -> Categorical: ... 
@overload diff --git a/pandas-stubs/core/reshape/melt.pyi b/pandas-stubs/core/reshape/melt.pyi index dc40b68fa..e8c01ec65 100644 --- a/pandas-stubs/core/reshape/melt.pyi +++ b/pandas-stubs/core/reshape/melt.pyi @@ -1,16 +1,27 @@ +from typing import Hashable + import numpy as np from pandas.core.frame import DataFrame +from pandas._typing import HashableT + def melt( frame: DataFrame, id_vars: tuple | list | np.ndarray | None = ..., value_vars: tuple | list | np.ndarray | None = ..., var_name: str | None = ..., - value_name: str = ..., + value_name: Hashable = ..., col_level: int | str | None = ..., ignore_index: bool = ..., ) -> DataFrame: ... -def lreshape(data: DataFrame, groups, dropna: bool = ..., label=...) -> DataFrame: ... +def lreshape( + data: DataFrame, groups: dict[HashableT, list[HashableT]], dropna: bool = ... +) -> DataFrame: ... def wide_to_long( - df: DataFrame, stubnames, i, j, sep: str = ..., suffix: str = ... + df: DataFrame, + stubnames: str | list[str], + i: str | list[str], + j: str, + sep: str = ..., + suffix: str = ..., ) -> DataFrame: ... 
diff --git a/tests/test_pandas.py b/tests/test_pandas.py index f44afc595..4a716e9c7 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -208,7 +208,7 @@ def test_unique() -> None: ] ) ), - pd.Index, + Union[pd.Index, np.ndarray], ), pd.DatetimeIndex, ) @@ -246,6 +246,34 @@ def test_unique() -> None: ), np.ndarray, ) + check( + assert_type( + pd.unique(pd.Index(["a", "b", "c", "a"])), Union[pd.Index, np.ndarray] + ), + np.ndarray, + ) + check( + assert_type(pd.unique(pd.RangeIndex(0, 10)), np.ndarray), + np.ndarray, + ) + check( + assert_type(pd.unique(pd.Categorical(["a", "b", "c", "a"])), pd.Categorical), + pd.Categorical, + ) + check( + assert_type( + pd.unique(pd.period_range("2001Q1", periods=10, freq="D")), + pd.PeriodIndex, + ), + pd.PeriodIndex, + ) + check( + assert_type( + pd.unique(pd.timedelta_range(start="1 day", periods=4)), + Union[pd.Index, np.ndarray], + ), + np.ndarray, + ) # GH 200 @@ -316,3 +344,106 @@ def test_eval(): ), pd.DataFrame, ) + + +def test_wide_to_long(): + df = pd.DataFrame( + { + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), np.random.randn(3))), + } + ) + df["id"] = df.index + df["id2"] = df.index + 1 + check( + assert_type(pd.wide_to_long(df, ["A", "B"], i="id", j="year"), pd.DataFrame), + pd.DataFrame, + ) + check( + assert_type( + pd.wide_to_long(df, ["A", "B"], i=["id", "id2"], j="year"), pd.DataFrame + ), + pd.DataFrame, + ) + + +def test_melt(): + df = pd.DataFrame( + { + "A": {0: "a", 1: "b", 2: "c"}, + "B": {0: 1, 1: 3, 2: 5}, + "C": {0: 2, 1: 4, 2: 6}, + "D": {0: 3, 1: 6, 2: 9}, + "E": {0: 3, 1: 6, 2: 9}, + } + ) + check( + assert_type( + pd.melt(df, id_vars=["A"], value_vars=["B"], ignore_index=False), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.melt(df, id_vars=["A"], value_vars=["B"], value_name=("F",)), + pd.DataFrame, + ), + pd.DataFrame, + ) + df.columns = 
pd.MultiIndex.from_arrays([list("ABCDE"), list("FGHIJ")]) + check( + assert_type( + pd.melt( + df, id_vars=["A"], value_vars=["B"], ignore_index=False, col_level=0 + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + + +def test_lreshape() -> None: + data = pd.DataFrame( + { + "hr1": [514, 573], + "hr2": [545, 526], + "team": ["Red Sox", "Yankees"], + "year1": [2007, 2007], + "year2": [2008, 2008], + } + ) + check( + assert_type( + pd.lreshape( + data, {"year": ["year1", "year2"], "hr": ["hr1", "hr2"]}, dropna=True + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + data2 = pd.DataFrame( + { + "hr1": [514, 573], + ("hr2",): [545, 526], + "team": ["Red Sox", "Yankees"], + ("year1",): [2007, 2007], + "year2": [2008, 2008], + } + ) + from typing import Hashable + + groups: dict[Hashable, list[Hashable]] = { + ("year",): [("year1",), "year2"], + ("hr",): ["hr1", ("hr2",)], + } + check( + assert_type( + pd.lreshape(data2, groups=groups), + pd.DataFrame, + ), + pd.DataFrame, + ) From d3a07a0de27cc4f7546cdff111ff87fa9394e9be Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 5 Oct 2022 23:23:53 +0100 Subject: [PATCH 02/20] Further typing --- pandas-stubs/core/algorithms.pyi | 11 +++++++---- tests/test_pandas.py | 24 ++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/pandas-stubs/core/algorithms.pyi b/pandas-stubs/core/algorithms.pyi index 1ef1905f5..972f66ccd 100644 --- a/pandas-stubs/core/algorithms.pyi +++ b/pandas-stubs/core/algorithms.pyi @@ -1,5 +1,5 @@ from typing import ( - Any, + Sequence, overload, ) @@ -16,7 +16,10 @@ from pandas import ( ) from pandas.api.extensions import ExtensionArray -from pandas._typing import AnyArrayLike +from pandas._typing import ( + AnyArrayLike, + npt, +) @overload def unique(values: DatetimeIndex) -> DatetimeIndex: ... @@ -37,13 +40,13 @@ def unique(values: np.ndarray | list) -> np.ndarray: ... @overload def unique(values: ExtensionArray) -> ExtensionArray: ... 
def factorize( - values: Any, + values: Sequence | AnyArrayLike, sort: bool = ..., # Not actually positional-only, used to handle deprecations in 1.5.0 *, use_na_sentinel: bool = ..., size_hint: int | None = ..., -) -> tuple[np.ndarray, np.ndarray | Index]: ... +) -> tuple[np.ndarray, np.ndarray | Index | Categorical]: ... def value_counts( values: AnyArrayLike | list | tuple, sort: bool = ..., diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 4a716e9c7..5a1107e19 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -8,13 +8,15 @@ ) import numpy as np -from numpy import typing as npt import pandas as pd from pandas.api.extensions import ExtensionArray import pytest from typing_extensions import assert_type -from pandas._typing import Scalar +from pandas._typing import ( + Scalar, + npt, +) from tests import check @@ -447,3 +449,21 @@ def test_lreshape() -> None: ), pd.DataFrame, ) + + +def test_factorize() -> None: + codes, uniques = pd.factorize(["b", "b", "a", "c", "b"]) + check(assert_type(codes, np.ndarray), np.ndarray) + check(assert_type(uniques, Union[pd.Index, pd.Categorical, np.ndarray]), np.ndarray) + codes, uniques = pd.factorize(pd.Series(["b", "b", "a", "c", "b"])) + check(assert_type(codes, np.ndarray), np.ndarray) + check(assert_type(uniques, Union[pd.Index, pd.Categorical, np.ndarray]), np.ndarray) + codes, uniques = pd.factorize("bbacb") + check(assert_type(codes, np.ndarray), np.ndarray) + check(assert_type(uniques, Union[pd.Index, pd.Categorical, np.ndarray]), np.ndarray) + + codes, uniques = pd.factorize( + ["b", "b", "a", "c", "b"], use_na_sentinel=True, size_hint=10 + ) + check(assert_type(codes, np.ndarray), np.ndarray) + check(assert_type(uniques, Union[pd.Index, pd.Categorical, np.ndarray]), np.ndarray) From 936edb0772447c0743b0f9cb3c07cc28f5128446 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 6 Oct 2022 07:40:07 +0100 Subject: [PATCH 03/20] ENH: Add overload to factorize --- 
pandas-stubs/core/algorithms.pyi | 28 ++++++++++++++++++++++------ tests/test_pandas.py | 20 +++++++++++++++----- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/pandas-stubs/core/algorithms.pyi b/pandas-stubs/core/algorithms.pyi index 972f66ccd..35efb5582 100644 --- a/pandas-stubs/core/algorithms.pyi +++ b/pandas-stubs/core/algorithms.pyi @@ -16,10 +16,7 @@ from pandas import ( ) from pandas.api.extensions import ExtensionArray -from pandas._typing import ( - AnyArrayLike, - npt, -) +from pandas._typing import AnyArrayLike @overload def unique(values: DatetimeIndex) -> DatetimeIndex: ... @@ -39,14 +36,33 @@ def unique(values: Series) -> np.ndarray | ExtensionArray: ... def unique(values: np.ndarray | list) -> np.ndarray: ... @overload def unique(values: ExtensionArray) -> ExtensionArray: ... +@overload +def factorize( + values: Sequence, + sort: bool = ..., + # Not actually positional-only, used to handle deprecations in 1.5.0 + *, + use_na_sentinel: bool = ..., + size_hint: int | None = ..., +) -> tuple[np.ndarray, np.ndarray]: ... +@overload +def factorize( + values: Index | Series, + sort: bool = ..., + # Not actually positional-only, used to handle deprecations in 1.5.0 + *, + use_na_sentinel: bool = ..., + size_hint: int | None = ..., +) -> tuple[np.ndarray, Index]: ... +@overload def factorize( - values: Sequence | AnyArrayLike, + values: Categorical, sort: bool = ..., # Not actually positional-only, used to handle deprecations in 1.5.0 *, use_na_sentinel: bool = ..., size_hint: int | None = ..., -) -> tuple[np.ndarray, np.ndarray | Index | Categorical]: ... +) -> tuple[np.ndarray, Categorical]: ... 
def value_counts( values: AnyArrayLike | list | tuple, sort: bool = ..., diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 5a1107e19..3e9d9d3ea 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -454,16 +454,26 @@ def test_lreshape() -> None: def test_factorize() -> None: codes, uniques = pd.factorize(["b", "b", "a", "c", "b"]) check(assert_type(codes, np.ndarray), np.ndarray) - check(assert_type(uniques, Union[pd.Index, pd.Categorical, np.ndarray]), np.ndarray) - codes, uniques = pd.factorize(pd.Series(["b", "b", "a", "c", "b"])) + check(assert_type(uniques, np.ndarray), np.ndarray) + + codes, cat_uniques = pd.factorize(pd.Categorical(["b", "b", "a", "c", "b"])) + check(assert_type(codes, np.ndarray), np.ndarray) + check(assert_type(cat_uniques, pd.Categorical), pd.Categorical) + + codes, idx_uniques = pd.factorize(pd.Index(["b", "b", "a", "c", "b"])) check(assert_type(codes, np.ndarray), np.ndarray) - check(assert_type(uniques, Union[pd.Index, pd.Categorical, np.ndarray]), np.ndarray) + check(assert_type(idx_uniques, pd.Index), pd.Index) + + codes, idx_uniques = pd.factorize(pd.Series(["b", "b", "a", "c", "b"])) + check(assert_type(codes, np.ndarray), np.ndarray) + check(assert_type(idx_uniques, pd.Index), pd.Index) + codes, uniques = pd.factorize("bbacb") check(assert_type(codes, np.ndarray), np.ndarray) - check(assert_type(uniques, Union[pd.Index, pd.Categorical, np.ndarray]), np.ndarray) + check(assert_type(uniques, np.ndarray), np.ndarray) codes, uniques = pd.factorize( ["b", "b", "a", "c", "b"], use_na_sentinel=True, size_hint=10 ) check(assert_type(codes, np.ndarray), np.ndarray) - check(assert_type(uniques, Union[pd.Index, pd.Categorical, np.ndarray]), np.ndarray) + check(assert_type(uniques, np.ndarray), np.ndarray) From f6e38ed38db0393a8d9bfbae9c8ef397cc8ffd26 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 6 Oct 2022 07:45:40 +0100 Subject: [PATCH 04/20] BUG: Correct npt import --- tests/test_pandas.py | 6 ++---- 1 
file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 3e9d9d3ea..3be8176a6 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -8,15 +8,13 @@ ) import numpy as np +import numpy.typing as npt import pandas as pd from pandas.api.extensions import ExtensionArray import pytest from typing_extensions import assert_type -from pandas._typing import ( - Scalar, - npt, -) +from pandas._typing import Scalar from tests import check From e68812feba1bef808b1e04db3a913302454f2e3d Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 6 Oct 2022 08:05:26 +0100 Subject: [PATCH 05/20] ENH: Improve unique --- pandas-stubs/core/algorithms.pyi | 5 ++--- tests/test_pandas.py | 28 +++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/pandas-stubs/core/algorithms.pyi b/pandas-stubs/core/algorithms.pyi index 35efb5582..2c6ff297b 100644 --- a/pandas-stubs/core/algorithms.pyi +++ b/pandas-stubs/core/algorithms.pyi @@ -10,6 +10,7 @@ from pandas import ( CategoricalIndex, DatetimeIndex, Index, + IntervalIndex, PeriodIndex, RangeIndex, Series, @@ -18,14 +19,12 @@ from pandas.api.extensions import ExtensionArray from pandas._typing import AnyArrayLike -@overload -def unique(values: DatetimeIndex) -> DatetimeIndex: ... @overload def unique(values: PeriodIndex) -> PeriodIndex: ... @overload def unique(values: CategoricalIndex) -> CategoricalIndex: ... @overload -def unique(values: RangeIndex | pd.Float64Index) -> np.ndarray: ... +def unique(values: IntervalIndex) -> IntervalIndex: ... @overload def unique(values: Index) -> Index | np.ndarray: ... 
@overload diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 3be8176a6..b5524b0f8 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -253,7 +253,7 @@ def test_unique() -> None: np.ndarray, ) check( - assert_type(pd.unique(pd.RangeIndex(0, 10)), np.ndarray), + assert_type(pd.unique(pd.RangeIndex(0, 10)), Union[pd.Index, np.ndarray]), np.ndarray, ) check( @@ -475,3 +475,29 @@ def test_factorize() -> None: ) check(assert_type(codes, np.ndarray), np.ndarray) check(assert_type(uniques, np.ndarray), np.ndarray) + + +def test_index_unqiue() -> None: + ci = pd.CategoricalIndex(["a", "b", "a", "c"]) + dti = pd.DatetimeIndex([pd.Timestamp(2000, 1, 1)]) + fi = pd.Float64Index([1.0, 2.0]) + i = pd.Index(["a", "b", "c", "a"]) + i64i = pd.Int64Index([1, 2, 3, 4]) + pi = pd.PeriodIndex(["2000Q1"], freq="Q") + ri = pd.RangeIndex(0, 10) + ui = pd.UInt64Index([0, 1, 2, 3, 5]) + tdi = pd.timedelta_range("1 day", "10 days", periods=10) + mi = pd.MultiIndex.from_product([["a", "b"], ["apple", "banana"]]) + interval_i = pd.interval_range(1, 10, periods=10) + + check(assert_type(pd.unique(ci), pd.CategoricalIndex), pd.CategoricalIndex) + check(assert_type(pd.unique(dti), Union[pd.Index, np.ndarray]), np.ndarray) + check(assert_type(pd.unique(fi), Union[pd.Index, np.ndarray]), np.ndarray) + check(assert_type(pd.unique(i), Union[pd.Index, np.ndarray]), np.ndarray) + check(assert_type(pd.unique(i64i), Union[pd.Index, np.ndarray]), np.ndarray) + check(assert_type(pd.unique(pi), pd.PeriodIndex), pd.PeriodIndex) + check(assert_type(pd.unique(ri), Union[pd.Index, np.ndarray]), np.ndarray) + check(assert_type(pd.unique(ui), Union[pd.Index, np.ndarray]), np.ndarray) + check(assert_type(pd.unique(tdi), Union[pd.Index, np.ndarray]), np.ndarray) + check(assert_type(pd.unique(mi), Union[pd.Index, np.ndarray]), np.ndarray) + check(assert_type(pd.unique(interval_i), pd.IntervalIndex), pd.IntervalIndex) From 65538fefb309510f02f27619e0a53890f1c202bd Mon Sep 17 00:00:00 2001 From: 
Kevin Sheppard Date: Thu, 6 Oct 2022 15:48:07 +0100 Subject: [PATCH 06/20] ENH: Improve typing in merge functions, cut and qcut --- pandas-stubs/_typing.pyi | 10 ++ pandas-stubs/core/frame.pyi | 13 +- pandas-stubs/core/reshape/merge.pyi | 142 ++++++------------ pandas-stubs/core/reshape/tile.pyi | 215 ++++++++++++++++++++++++++-- 4 files changed, 259 insertions(+), 121 deletions(-) diff --git a/pandas-stubs/_typing.pyi b/pandas-stubs/_typing.pyi index 4ef2c6c75..19a453c7a 100644 --- a/pandas-stubs/_typing.pyi +++ b/pandas-stubs/_typing.pyi @@ -332,4 +332,14 @@ class StyleExportDict(TypedDict, total=False): CalculationMethod: TypeAlias = Literal["single", "table"] +ValidationOptions: TypeAlias = Literal[ + "one_to_one", + "1:1", + "one_to_many", + "1:m", + "many_to_one", + "m:1", + "many_to_many", + "m:m", +] __all__ = ["npt", "type_t"] diff --git a/pandas-stubs/core/frame.pyi b/pandas-stubs/core/frame.pyi index ac53243c1..de4dab9ca 100644 --- a/pandas-stubs/core/frame.pyi +++ b/pandas-stubs/core/frame.pyi @@ -97,6 +97,7 @@ from pandas._typing import ( Suffixes, T as TType, TimestampConvention, + ValidationOptions, WriteBuffer, XMLParsers, np_ndarray_bool, @@ -1105,17 +1106,7 @@ class DataFrame(NDFrame, OpsMixin): lsuffix: _str = ..., rsuffix: _str = ..., sort: _bool = ..., - validate: Literal[ - "one_to_one", - "1:1", - "one_to_many", - "1:m", - "many_to_one", - "m:1", - "many_to_many", - "m:m", - ] - | None = ..., + validate: ValidationOptions | None = ..., ) -> DataFrame: ... 
def merge( self, diff --git a/pandas-stubs/core/reshape/merge.pyi b/pandas-stubs/core/reshape/merge.pyi index 8233f51c2..f014be0c5 100644 --- a/pandas-stubs/core/reshape/merge.pyi +++ b/pandas-stubs/core/reshape/merge.pyi @@ -1,4 +1,8 @@ -from typing import Sequence +from typing import ( + Hashable, + Literal, + Sequence, +) from pandas import ( DataFrame, @@ -8,131 +12,65 @@ from pandas import ( from pandas._libs.tslibs import Timedelta from pandas._typing import ( AnyArrayLike, + HashableT, Label, + ValidationOptions, ) def merge( + # TODO: Verify Series is accepted and correct in docs left: DataFrame | Series, right: DataFrame | Series, - how: str = ..., - on: Label | Sequence | AnyArrayLike | None = ..., - left_on: Label | Sequence | AnyArrayLike | None = ..., - right_on: Label | Sequence | AnyArrayLike | None = ..., + how: Literal["left", "right", "outer", "inner", "cross"] = ..., + on: Label | list[HashableT] | AnyArrayLike | None = ..., + left_on: Label | list[HashableT] | AnyArrayLike | None = ..., + right_on: Label | list[HashableT] | AnyArrayLike | None = ..., left_index: bool = ..., right_index: bool = ..., sort: bool = ..., - suffixes: Sequence[str | None] = ..., + suffixes: list[str | None] + | tuple[str, str] + | tuple[None, str] + | tuple[str, None] = ..., copy: bool = ..., indicator: bool | str = ..., - validate: str = ..., + validate: ValidationOptions = ..., ) -> DataFrame: ... 
def merge_ordered( + # TODO: Verify Series is accepted and correct in docs left: DataFrame | Series, + # TODO: Verify Series is accepted and correct in docs right: DataFrame | Series, - on: Label | Sequence | AnyArrayLike | None = ..., - left_on: Label | Sequence | AnyArrayLike | None = ..., - right_on: Label | Sequence | AnyArrayLike | None = ..., - left_by: str | list[str] | None = ..., - right_by: str | list[str] | None = ..., - fill_method: str | None = ..., - suffixes: Sequence[str | None] = ..., - how: str = ..., + on: Label | list[HashableT] | AnyArrayLike | None = ..., + left_on: Label | list[HashableT] | AnyArrayLike | None = ..., + right_on: Label | list[HashableT] | AnyArrayLike | None = ..., + left_by: Label | list[HashableT] | None = ..., + right_by: Label | list[HashableT] | None = ..., + fill_method: Literal["ffill"] | None = ..., + suffixes: list[str | None] + | tuple[str, str] + | tuple[None, str] + | tuple[str, None] = ..., + how: Literal["left", "right", "outer", "inner"] = ..., ) -> DataFrame: ... def merge_asof( left: DataFrame | Series, right: DataFrame | Series, on: Label | None = ..., + # TODO: Is AnyArrayLike accepted? Not in docs left_on: Label | AnyArrayLike | None = ..., + # TODO: Is AnyArrayLike accepted? Not in docs right_on: Label | AnyArrayLike | None = ..., left_index: bool = ..., right_index: bool = ..., - by: str | list[str] | None = ..., - left_by: str | None = ..., - right_by: str | None = ..., - suffixes: Sequence[str | None] = ..., + by: Label | list[HashableT] | None = ..., + left_by: Label | None = ..., + right_by: Label | None = ..., + suffixes: list[str | None] + | tuple[str, str] + | tuple[None, str] + | tuple[str, None] = ..., tolerance: int | Timedelta | None = ..., allow_exact_matches: bool = ..., - direction: str = ..., + direction: Literal["backward", "forward", "nearest"] = ..., ) -> DataFrame: ... - -class _MergeOperation: - left = ... - right = ... - how = ... - axis = ... - on = ... - left_on = ... 
- right_on = ... - copy = ... - suffixes = ... - sort = ... - left_index = ... - right_index = ... - indicator = ... - indicator_name = ... - def __init__( - self, - left: Series | DataFrame, - right: Series | DataFrame, - how: str = ..., - on=..., - left_on=..., - right_on=..., - axis=..., - left_index: bool = ..., - right_index: bool = ..., - sort: bool = ..., - suffixes=..., - copy: bool = ..., - indicator: bool = ..., - validate=..., - ) -> None: ... - def get_result(self): ... - -class _OrderedMerge(_MergeOperation): - fill_method = ... - def __init__( - self, - left, - right, - on=..., - left_on=..., - right_on=..., - left_index: bool = ..., - right_index: bool = ..., - axis=..., - suffixes=..., - copy: bool = ..., - fill_method=..., - how: str = ..., - ) -> None: ... - def get_result(self): ... - -class _AsOfMerge(_OrderedMerge): - by = ... - left_by = ... - right_by = ... - tolerance = ... - allow_exact_matches = ... - direction = ... - def __init__( - self, - left, - right, - on=..., - left_on=..., - right_on=..., - left_index: bool = ..., - right_index: bool = ..., - by=..., - left_by=..., - right_by=..., - axis=..., - suffixes=..., - copy: bool = ..., - fill_method=..., - how: str = ..., - tolerance=..., - allow_exact_matches: bool = ..., - direction: str = ..., - ) -> None: ... 
diff --git a/pandas-stubs/core/reshape/tile.pyi b/pandas-stubs/core/reshape/tile.pyi index 31970a9fc..fa7ac4252 100644 --- a/pandas-stubs/core/reshape/tile.pyi +++ b/pandas-stubs/core/reshape/tile.pyi @@ -1,13 +1,212 @@ +from typing import ( + Literal, + Sequence, + overload, +) + +from pandas import ( + Categorical, + Float64Index, + Index, + Int64Index, + IntervalIndex, + Series, +) + +from pandas._typing import ( + ArrayLike, + Label, + npt, +) + +@overload def cut( - x, - bins, + x: Series | Index | npt.NDArray | Sequence[int] | Sequence[float], + bins: int | Series | Int64Index | Float64Index | Sequence[int] | Sequence[float], right: bool = ..., - labels=..., - retbins: bool = ..., + *, + labels: Literal[False], + retbins: Literal[True], precision: int = ..., include_lowest: bool = ..., - duplicates: str = ..., -): ... + duplicates: Literal["raise", "drop"] = ..., + ordered: bool = ..., +) -> tuple[npt.NDArray, npt.NDArray]: ... +@overload +def cut( + x: Series | Index | npt.NDArray | Sequence[int] | Sequence[float], + bins: IntervalIndex, + right: bool = ..., + *, + labels: Literal[False], + retbins: Literal[True], + precision: int = ..., + include_lowest: bool = ..., + duplicates: Literal["raise", "drop"] = ..., + ordered: bool = ..., +) -> tuple[npt.NDArray, IntervalIndex]: ... +@overload +def cut( + x: Categorical, + bins: int | Series | Int64Index | Float64Index | Sequence[int] | Sequence[float], + right: bool = ..., + labels: Sequence[Label] | None = ..., + *, + retbins: Literal[True], + precision: int = ..., + include_lowest: bool = ..., + duplicates: Literal["raise", "drop"] = ..., + ordered: bool = ..., +) -> tuple[Categorical, npt.NDArray]: ... 
+@overload +def cut( + x: Categorical, + bins: IntervalIndex, + right: bool = ..., + labels: Sequence[Label] | None = ..., + *, + retbins: Literal[True], + precision: int = ..., + include_lowest: bool = ..., + duplicates: Literal["raise", "drop"] = ..., + ordered: bool = ..., +) -> tuple[Categorical, IntervalIndex]: ... +@overload +def cut( + x: Series | Index | npt.NDArray | Sequence[int] | Sequence[float], + bins: int | Series | Int64Index | Float64Index | Sequence[int] | Sequence[float], + right: bool = ..., + labels: Sequence[Label] | None = ..., + *, + retbins: Literal[True], + precision: int = ..., + include_lowest: bool = ..., + duplicates: Literal["raise", "drop"] = ..., + ordered: bool = ..., +) -> tuple[Series, npt.NDArray]: ... +@overload +def cut( + x: Series | Index | npt.NDArray | Sequence[int] | Sequence[float], + bins: IntervalIndex, + right: bool = ..., + labels: Sequence[Label] | None = ..., + *, + retbins: Literal[True], + precision: int = ..., + include_lowest: bool = ..., + duplicates: Literal["raise", "drop"] = ..., + ordered: bool = ..., +) -> tuple[Series, IntervalIndex]: ... +@overload +def cut( + x: Series | Index | npt.NDArray | Sequence[int] | Sequence[float], + bins: int + | Series + | Int64Index + | Float64Index + | Sequence[int] + | Sequence[float] + | IntervalIndex, + right: bool = ..., + *, + labels: Literal[False], + retbins: Literal[False] = ..., + precision: int = ..., + include_lowest: bool = ..., + duplicates: Literal["raise", "drop"] = ..., + ordered: bool = ..., +) -> npt.NDArray: ... +@overload +def cut( + x: Categorical, + bins: int + | Series + | Int64Index + | Float64Index + | Sequence[int] + | Sequence[float] + | IntervalIndex, + right: bool = ..., + labels: Sequence[Label] | None = ..., + retbins: Literal[False] = ..., + precision: int = ..., + include_lowest: bool = ..., + duplicates: Literal["raise", "drop"] = ..., + ordered: bool = ..., +) -> Categorical: ... 
+@overload +def cut( + x: Series | Index | npt.NDArray | Sequence[int] | Sequence[float], + bins: int + | Series + | Int64Index + | Float64Index + | Sequence[int] + | Sequence[float] + | IntervalIndex, + right: bool = ..., + labels: Sequence[Label] | None = ..., + retbins: Literal[False] = ..., + precision: int = ..., + include_lowest: bool = ..., + duplicates: Literal["raise", "drop"] = ..., + ordered: bool = ..., +) -> Series: ... +@overload def qcut( - x, q, labels=..., retbins: bool = ..., precision: int = ..., duplicates: str = ... -): ... + x: npt.NDArray | Series, + q: int | Sequence[float] | Series[float] | Float64Index, + *, + labels: Literal[False], + retbins: Literal[False] = ..., + precision: int = ..., + duplicates: Literal["raise", "drop"] = ..., +) -> npt.NDArray: ... +@overload +def qcut( + x: npt.NDArray, + q: int | Sequence[float] | Series[float] | Float64Index, + labels: Sequence[Label] | None = ..., + retbins: Literal[False] = ..., + precision: int = ..., + duplicates: Literal["raise", "drop"] = ..., +) -> Categorical: ... +@overload +def qcut( + x: Series, + q: int | Sequence[float] | Series[float] | Float64Index, + labels: Sequence[Label] | None = ..., + retbins: Literal[False] = ..., + precision: int = ..., + duplicates: Literal["raise", "drop"] = ..., +) -> Series: ... +@overload +def qcut( + x: npt.NDArray | Series, + q: int | Sequence[float] | Series[float] | Float64Index, + *, + labels: Literal[False], + retbins: Literal[True], + precision: int = ..., + duplicates: Literal["raise", "drop"] = ..., +) -> tuple[npt.NDArray, npt.NDArray]: ... +@overload +def qcut( + x: Series, + q: int | Sequence[float] | Series[float] | Float64Index, + labels: Sequence[Label] | None = ..., + *, + retbins: Literal[True], + precision: int = ..., + duplicates: Literal["raise", "drop"] = ..., +) -> tuple[Series, npt.NDArray]: ... 
+@overload +def qcut( + x: npt.NDArray, + q: int | Sequence[float] | Series[float] | Float64Index, + labels: Sequence[Label] | None = ..., + *, + retbins: Literal[True], + precision: int = ..., + duplicates: Literal["raise", "drop"] = ..., +) -> tuple[Categorical, npt.NDArray]: ... From 220c5615c4aa5b570322cfd19e06910929fea369 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 6 Oct 2022 18:42:03 +0100 Subject: [PATCH 07/20] CLN: Remove unused imports --- pandas-stubs/core/algorithms.pyi | 2 -- pandas-stubs/core/reshape/merge.pyi | 6 +----- pandas-stubs/core/reshape/tile.pyi | 1 - tests/test_pandas.py | 2 +- 4 files changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas-stubs/core/algorithms.pyi b/pandas-stubs/core/algorithms.pyi index 2c6ff297b..c28ebcff5 100644 --- a/pandas-stubs/core/algorithms.pyi +++ b/pandas-stubs/core/algorithms.pyi @@ -8,11 +8,9 @@ import pandas as pd from pandas import ( Categorical, CategoricalIndex, - DatetimeIndex, Index, IntervalIndex, PeriodIndex, - RangeIndex, Series, ) from pandas.api.extensions import ExtensionArray diff --git a/pandas-stubs/core/reshape/merge.pyi b/pandas-stubs/core/reshape/merge.pyi index f014be0c5..e2a09e76e 100644 --- a/pandas-stubs/core/reshape/merge.pyi +++ b/pandas-stubs/core/reshape/merge.pyi @@ -1,8 +1,4 @@ -from typing import ( - Hashable, - Literal, - Sequence, -) +from typing import Literal from pandas import ( DataFrame, diff --git a/pandas-stubs/core/reshape/tile.pyi b/pandas-stubs/core/reshape/tile.pyi index fa7ac4252..27be9b2ba 100644 --- a/pandas-stubs/core/reshape/tile.pyi +++ b/pandas-stubs/core/reshape/tile.pyi @@ -14,7 +14,6 @@ from pandas import ( ) from pandas._typing import ( - ArrayLike, Label, npt, ) diff --git a/tests/test_pandas.py b/tests/test_pandas.py index b5524b0f8..f6189c619 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -483,7 +483,7 @@ def test_index_unqiue() -> None: fi = pd.Float64Index([1.0, 2.0]) i = pd.Index(["a", "b", "c", "a"]) i64i = 
pd.Int64Index([1, 2, 3, 4]) - pi = pd.PeriodIndex(["2000Q1"], freq="Q") + pi = pd.period_range("2000Q1", periods=2, freq="Q") ri = pd.RangeIndex(0, 10) ui = pd.UInt64Index([0, 1, 2, 3, 5]) tdi = pd.timedelta_range("1 day", "10 days", periods=10) From 9e61288faa61eb7806054c94b028893bd1559844 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 6 Oct 2022 18:51:44 +0100 Subject: [PATCH 08/20] CLN: Catch warning --- tests/test_pandas.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_pandas.py b/tests/test_pandas.py index f6189c619..2b2ff193e 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -480,12 +480,15 @@ def test_factorize() -> None: def test_index_unqiue() -> None: ci = pd.CategoricalIndex(["a", "b", "a", "c"]) dti = pd.DatetimeIndex([pd.Timestamp(2000, 1, 1)]) - fi = pd.Float64Index([1.0, 2.0]) + with pytest.warns(FutureWarning, match="pandas.Float64Index is deprecated"): + fi = pd.Float64Index([1.0, 2.0]) i = pd.Index(["a", "b", "c", "a"]) - i64i = pd.Int64Index([1, 2, 3, 4]) + with pytest.warns(FutureWarning, match="pandas.Int64Index is deprecated"): + i64i = pd.Int64Index([1, 2, 3, 4]) pi = pd.period_range("2000Q1", periods=2, freq="Q") ri = pd.RangeIndex(0, 10) - ui = pd.UInt64Index([0, 1, 2, 3, 5]) + with pytest.warns(FutureWarning, match="pandas.UInt64Index is deprecated"): + ui = pd.UInt64Index([0, 1, 2, 3, 5]) tdi = pd.timedelta_range("1 day", "10 days", periods=10) mi = pd.MultiIndex.from_product([["a", "b"], ["apple", "banana"]]) interval_i = pd.interval_range(1, 10, periods=10) From 5e0df695e3224433878c4a9a3258a395bfa79dac Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 6 Oct 2022 22:56:28 +0100 Subject: [PATCH 09/20] TST: Add tests for cut and fixes --- pandas-stubs/core/reshape/tile.pyi | 39 ++++++++++---------- tests/test_pandas.py | 59 ++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 19 deletions(-) diff --git a/pandas-stubs/core/reshape/tile.pyi 
b/pandas-stubs/core/reshape/tile.pyi index 27be9b2ba..aafbcf8b2 100644 --- a/pandas-stubs/core/reshape/tile.pyi +++ b/pandas-stubs/core/reshape/tile.pyi @@ -4,6 +4,7 @@ from typing import ( overload, ) +import numpy as np from pandas import ( Categorical, Float64Index, @@ -20,7 +21,7 @@ from pandas._typing import ( @overload def cut( - x: Series | Index | npt.NDArray | Sequence[int] | Sequence[float], + x: Index | npt.NDArray | Sequence[int] | Sequence[float], bins: int | Series | Int64Index | Float64Index | Sequence[int] | Sequence[float], right: bool = ..., *, @@ -30,10 +31,10 @@ def cut( include_lowest: bool = ..., duplicates: Literal["raise", "drop"] = ..., ordered: bool = ..., -) -> tuple[npt.NDArray, npt.NDArray]: ... +) -> tuple[npt.NDArray[np.intp], npt.NDArray]: ... @overload def cut( - x: Series | Index | npt.NDArray | Sequence[int] | Sequence[float], + x: Index | npt.NDArray | Sequence[int] | Sequence[float], bins: IntervalIndex, right: bool = ..., *, @@ -46,20 +47,20 @@ def cut( ) -> tuple[npt.NDArray, IntervalIndex]: ... @overload def cut( - x: Categorical, + x: Series, bins: int | Series | Int64Index | Float64Index | Sequence[int] | Sequence[float], right: bool = ..., - labels: Sequence[Label] | None = ..., + labels: Literal[False] | Sequence[Label] | None = ..., *, retbins: Literal[True], precision: int = ..., include_lowest: bool = ..., duplicates: Literal["raise", "drop"] = ..., ordered: bool = ..., -) -> tuple[Categorical, npt.NDArray]: ... +) -> tuple[Series, npt.NDArray]: ... @overload def cut( - x: Categorical, + x: Series, bins: IntervalIndex, right: bool = ..., labels: Sequence[Label] | None = ..., @@ -69,10 +70,10 @@ def cut( include_lowest: bool = ..., duplicates: Literal["raise", "drop"] = ..., ordered: bool = ..., -) -> tuple[Categorical, IntervalIndex]: ... +) -> tuple[Series, IntervalIndex]: ... 
@overload def cut( - x: Series | Index | npt.NDArray | Sequence[int] | Sequence[float], + x: Index | npt.NDArray | Sequence[int] | Sequence[float], bins: int | Series | Int64Index | Float64Index | Sequence[int] | Sequence[float], right: bool = ..., labels: Sequence[Label] | None = ..., @@ -82,10 +83,10 @@ def cut( include_lowest: bool = ..., duplicates: Literal["raise", "drop"] = ..., ordered: bool = ..., -) -> tuple[Series, npt.NDArray]: ... +) -> tuple[Categorical, npt.NDArray]: ... @overload def cut( - x: Series | Index | npt.NDArray | Sequence[int] | Sequence[float], + x: Index | npt.NDArray | Sequence[int] | Sequence[float], bins: IntervalIndex, right: bool = ..., labels: Sequence[Label] | None = ..., @@ -95,10 +96,10 @@ def cut( include_lowest: bool = ..., duplicates: Literal["raise", "drop"] = ..., ordered: bool = ..., -) -> tuple[Series, IntervalIndex]: ... +) -> tuple[Categorical, IntervalIndex]: ... @overload def cut( - x: Series | Index | npt.NDArray | Sequence[int] | Sequence[float], + x: Index | npt.NDArray | Sequence[int] | Sequence[float], bins: int | Series | Int64Index @@ -114,10 +115,10 @@ def cut( include_lowest: bool = ..., duplicates: Literal["raise", "drop"] = ..., ordered: bool = ..., -) -> npt.NDArray: ... +) -> npt.NDArray[np.intp]: ... @overload def cut( - x: Categorical, + x: Series, bins: int | Series | Int64Index @@ -126,16 +127,16 @@ def cut( | Sequence[float] | IntervalIndex, right: bool = ..., - labels: Sequence[Label] | None = ..., + labels: Literal[False] | Sequence[Label] | None = ..., retbins: Literal[False] = ..., precision: int = ..., include_lowest: bool = ..., duplicates: Literal["raise", "drop"] = ..., ordered: bool = ..., -) -> Categorical: ... +) -> Series: ... 
@overload def cut( - x: Series | Index | npt.NDArray | Sequence[int] | Sequence[float], + x: Index | npt.NDArray | Sequence[int] | Sequence[float], bins: int | Series | Int64Index @@ -150,7 +151,7 @@ def cut( include_lowest: bool = ..., duplicates: Literal["raise", "drop"] = ..., ordered: bool = ..., -) -> Series: ... +) -> Categorical: ... @overload def qcut( x: npt.NDArray | Series, diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 2b2ff193e..b5a00d7c4 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -504,3 +504,62 @@ def test_index_unqiue() -> None: check(assert_type(pd.unique(tdi), Union[pd.Index, np.ndarray]), np.ndarray) check(assert_type(pd.unique(mi), Union[pd.Index, np.ndarray]), np.ndarray) check(assert_type(pd.unique(interval_i), pd.IntervalIndex), pd.IntervalIndex) + + +def test_cut() -> None: + a = pd.cut([1, 2, 3, 4, 5, 6, 7, 8], 4, precision=1, duplicates="drop") + b = pd.cut([1, 2, 3, 4, 5, 6, 7, 8], 4, labels=False, duplicates="raise") + c = pd.cut([1, 2, 3, 4, 5, 6, 7, 8], 4, labels=["1", "2", "3", "4"]) + check(assert_type(a, pd.Categorical), pd.Categorical) + check(assert_type(b, npt.NDArray[np.intp]), np.ndarray) + check(assert_type(c, pd.Categorical), pd.Categorical) + + d0, d1 = pd.cut([1, 2, 3, 4, 5, 6, 7, 8], 4, retbins=True) + e0, e1 = pd.cut([1, 2, 3, 4, 5, 6, 7, 8], 4, labels=False, retbins=True) + f0, f1 = pd.cut( + [1, 2, 3, 4, 5, 6, 7, 8], 4, labels=["1", "2", "3", "4"], retbins=True + ) + check(assert_type(d0, pd.Categorical), pd.Categorical) + check(assert_type(d1, npt.NDArray), np.ndarray) + check(assert_type(e0, npt.NDArray[np.intp]), np.ndarray) + check(assert_type(e1, npt.NDArray), np.ndarray) + check(assert_type(f0, pd.Categorical), pd.Categorical) + check(assert_type(f1, npt.NDArray), np.ndarray) + + g = pd.cut(pd.Series([1, 2, 3, 4, 5, 6, 7, 8]), 4, precision=1, duplicates="drop") + h = pd.cut(pd.Series([1, 2, 3, 4, 5, 6, 7, 8]), 4, labels=False, duplicates="raise") + i = pd.cut(pd.Series([1, 2, 3, 
4, 5, 6, 7, 8]), 4, labels=["1", "2", "3", "4"]) + check(assert_type(g, pd.Series), pd.Series) + check(assert_type(h, pd.Series), pd.Series) + check(assert_type(i, pd.Series), pd.Series) + + j0, j1 = pd.cut( + pd.Series([1, 2, 3, 4, 5, 6, 7, 8]), + 4, + precision=1, + duplicates="drop", + retbins=True, + ) + k0, k1 = pd.cut( + pd.Series([1, 2, 3, 4, 5, 6, 7, 8]), + 4, + labels=False, + duplicates="raise", + retbins=True, + ) + l0, l1 = pd.cut( + pd.Series([1, 2, 3, 4, 5, 6, 7, 8]), + 4, + labels=["1", "2", "3", "4"], + retbins=True, + ) + check(assert_type(j0, pd.Series), pd.Series) + check(assert_type(j1, npt.NDArray), np.ndarray) + check(assert_type(k0, pd.Series), pd.Series) + check(assert_type(k1, npt.NDArray), np.ndarray) + check(assert_type(l0, pd.Series), pd.Series) + check(assert_type(l1, npt.NDArray), np.ndarray) + + +def test_qcut() -> None: + pass From 6c475f8d94b07c8aaac99c81f021d87460a3c409 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 6 Oct 2022 23:01:51 +0100 Subject: [PATCH 10/20] TST: Add final test for cut --- tests/test_pandas.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_pandas.py b/tests/test_pandas.py index b5a00d7c4..d561159d1 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -507,6 +507,7 @@ def test_index_unqiue() -> None: def test_cut() -> None: + intval_idx = pd.interval_range(0, 10, 4) a = pd.cut([1, 2, 3, 4, 5, 6, 7, 8], 4, precision=1, duplicates="drop") b = pd.cut([1, 2, 3, 4, 5, 6, 7, 8], 4, labels=False, duplicates="raise") c = pd.cut([1, 2, 3, 4, 5, 6, 7, 8], 4, labels=["1", "2", "3", "4"]) @@ -553,12 +554,23 @@ def test_cut() -> None: labels=["1", "2", "3", "4"], retbins=True, ) + m0, m1 = pd.cut( + pd.Series([1, 2, 3, 4, 5, 6, 7, 8]), + intval_idx, + retbins=True, + ) check(assert_type(j0, pd.Series), pd.Series) check(assert_type(j1, npt.NDArray), np.ndarray) check(assert_type(k0, pd.Series), pd.Series) check(assert_type(k1, npt.NDArray), np.ndarray) check(assert_type(l0, 
pd.Series), pd.Series) check(assert_type(l1, npt.NDArray), np.ndarray) + check(assert_type(m0, pd.Series), pd.Series) + check(assert_type(m1, pd.IntervalIndex), pd.IntervalIndex) + + n0, n1 = pd.cut([1, 2, 3, 4, 5, 6, 7, 8], intval_idx, retbins=True) + check(assert_type(n0, pd.Categorical), pd.Categorical) + check(assert_type(n1, pd.IntervalIndex), pd.IntervalIndex) def test_qcut() -> None: From 7ca155a8b6926c2871949415ea90f9eeb9d485f8 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 6 Oct 2022 23:39:34 +0100 Subject: [PATCH 11/20] TST: Add tests and fixes for qcut --- pandas-stubs/core/reshape/tile.pyi | 30 +++++++------- tests/test_pandas.py | 64 +++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/pandas-stubs/core/reshape/tile.pyi b/pandas-stubs/core/reshape/tile.pyi index aafbcf8b2..1f7391bbe 100644 --- a/pandas-stubs/core/reshape/tile.pyi +++ b/pandas-stubs/core/reshape/tile.pyi @@ -154,8 +154,8 @@ def cut( ) -> Categorical: ... @overload def qcut( - x: npt.NDArray | Series, - q: int | Sequence[float] | Series[float] | Float64Index, + x: Index | npt.NDArray | Sequence[int] | Sequence[float], + q: int | Sequence[float] | Series[float] | Float64Index | npt.NDArray, *, labels: Literal[False], retbins: Literal[False] = ..., @@ -164,8 +164,8 @@ def qcut( ) -> npt.NDArray: ... 
@overload def qcut( - x: npt.NDArray, - q: int | Sequence[float] | Series[float] | Float64Index, + x: Index | npt.NDArray | Sequence[int] | Sequence[float], + q: int | Sequence[float] | Series[float] | Float64Index | npt.NDArray, labels: Sequence[Label] | None = ..., retbins: Literal[False] = ..., precision: int = ..., @@ -174,39 +174,39 @@ def qcut( @overload def qcut( x: Series, - q: int | Sequence[float] | Series[float] | Float64Index, - labels: Sequence[Label] | None = ..., + q: int | Sequence[float] | Series[float] | Float64Index | npt.NDArray, + labels: Literal[False] | Sequence[Label] | None = ..., retbins: Literal[False] = ..., precision: int = ..., duplicates: Literal["raise", "drop"] = ..., ) -> Series: ... @overload def qcut( - x: npt.NDArray | Series, - q: int | Sequence[float] | Series[float] | Float64Index, + x: Index | npt.NDArray | Sequence[int] | Sequence[float], + q: int | Sequence[float] | Series[float] | Float64Index | npt.NDArray, *, labels: Literal[False], retbins: Literal[True], precision: int = ..., duplicates: Literal["raise", "drop"] = ..., -) -> tuple[npt.NDArray, npt.NDArray]: ... +) -> tuple[npt.NDArray, npt.NDArray[np.float_]]: ... @overload def qcut( x: Series, - q: int | Sequence[float] | Series[float] | Float64Index, - labels: Sequence[Label] | None = ..., + q: int | Sequence[float] | Series[float] | Float64Index | npt.NDArray, + labels: Literal[False] | Sequence[Label] | None = ..., *, retbins: Literal[True], precision: int = ..., duplicates: Literal["raise", "drop"] = ..., -) -> tuple[Series, npt.NDArray]: ... +) -> tuple[Series, npt.NDArray[np.float_]]: ... 
@overload def qcut( - x: npt.NDArray, - q: int | Sequence[float] | Series[float] | Float64Index, + x: Index | npt.NDArray | Sequence[int] | Sequence[float], + q: int | Sequence[float] | Series[float] | Float64Index | npt.NDArray, labels: Sequence[Label] | None = ..., *, retbins: Literal[True], precision: int = ..., duplicates: Literal["raise", "drop"] = ..., -) -> tuple[Categorical, npt.NDArray]: ... +) -> tuple[Categorical, npt.NDArray[np.float_]]: ... diff --git a/tests/test_pandas.py b/tests/test_pandas.py index d561159d1..d8e3acedb 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -1,5 +1,6 @@ from __future__ import annotations +import random from typing import ( TYPE_CHECKING, Any, @@ -574,4 +575,65 @@ def test_cut() -> None: def test_qcut() -> None: - pass + val_list = [random.random() for _ in range(20)] + val_arr = np.array(val_list) + val_series = pd.Series(val_list) + val_idx = pd.Index(val_list) + + check( + assert_type( + pd.qcut(val_list, 4, precision=2, duplicates="raise"), pd.Categorical + ), + pd.Categorical, + ) + check( + assert_type( + pd.qcut(val_arr, 4, precision=2, duplicates="drop"), pd.Categorical + ), + pd.Categorical, + ) + check( + assert_type( + pd.qcut(val_idx, 4, precision=2, duplicates="drop"), pd.Categorical + ), + pd.Categorical, + ) + check( + assert_type(pd.qcut(val_series, 4, precision=2, duplicates="raise"), pd.Series), + pd.Series, + ) + + a0, a1 = pd.qcut(val_list, 4, retbins=True) + b0, b1 = pd.qcut(val_arr, 4, retbins=True) + c0, c1 = pd.qcut(val_idx, 4, retbins=True) + d0, d1 = pd.qcut(val_series, 4, retbins=True) + check(assert_type(a0, pd.Categorical), pd.Categorical) + check(assert_type(b0, pd.Categorical), pd.Categorical) + check(assert_type(c0, pd.Categorical), pd.Categorical) + check(assert_type(d0, pd.Series), pd.Series) + + check(assert_type(a1, npt.NDArray[np.float_]), np.ndarray) + check(assert_type(b1, npt.NDArray[np.float_]), np.ndarray) + check(assert_type(c1, npt.NDArray[np.float_]), np.ndarray) 
+ check(assert_type(d1, npt.NDArray[np.float_]), np.ndarray) + + e0, e1 = pd.qcut(val_list, [0.25, 0.5, 0.75], retbins=True) + f0, f1 = pd.qcut(val_arr, np.array([0.25, 0.5, 0.75]), retbins=True) + g0, g1 = pd.qcut(val_idx, 4, retbins=True, labels=False) + h0, h1 = pd.qcut(val_series, 4, retbins=True, labels=False) + i0, i1 = pd.qcut(val_list, [0.25, 0.5, 0.75], retbins=True, labels=False) + j0, j1 = pd.qcut(val_arr, np.array([0.25, 0.5, 0.75]), retbins=True, labels=False) + + check(assert_type(e0, pd.Categorical), pd.Categorical) + check(assert_type(f0, pd.Categorical), pd.Categorical) + check(assert_type(g0, npt.NDArray), np.ndarray) + check(assert_type(h0, pd.Series), pd.Series) + check(assert_type(i0, npt.NDArray), np.ndarray) + check(assert_type(j0, npt.NDArray), np.ndarray) + + check(assert_type(e1, npt.NDArray[np.float_]), np.ndarray) + check(assert_type(f1, npt.NDArray[np.float_]), np.ndarray) + check(assert_type(g1, npt.NDArray[np.float_]), np.ndarray) + check(assert_type(h1, npt.NDArray[np.float_]), np.ndarray) + check(assert_type(i1, npt.NDArray[np.float_]), np.ndarray) + check(assert_type(j1, npt.NDArray[np.float_]), np.ndarray) From a0fb2c3d9a85d8f7c9f4d10497bd897092f04c4b Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 7 Oct 2022 07:33:07 +0100 Subject: [PATCH 12/20] TYP: Add typ ignore for overlapping defs --- pandas-stubs/core/algorithms.pyi | 10 ++++++---- tests/test_pandas.py | 26 ++++++++++++-------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pandas-stubs/core/algorithms.pyi b/pandas-stubs/core/algorithms.pyi index c28ebcff5..4ac2621a1 100644 --- a/pandas-stubs/core/algorithms.pyi +++ b/pandas-stubs/core/algorithms.pyi @@ -17,14 +17,16 @@ from pandas.api.extensions import ExtensionArray from pandas._typing import AnyArrayLike +# These are type: ignored because the Index types overlap due to inheritance but indices +# with extension types return the same type while standard type return ndarray @overload -def 
unique(values: PeriodIndex) -> PeriodIndex: ... +def unique(values: PeriodIndex) -> PeriodIndex: ... # type: ignore[misc] @overload -def unique(values: CategoricalIndex) -> CategoricalIndex: ... +def unique(values: CategoricalIndex) -> CategoricalIndex: ... # type: ignore[misc] @overload -def unique(values: IntervalIndex) -> IntervalIndex: ... +def unique(values: IntervalIndex) -> IntervalIndex: ... # type: ignore[misc] @overload -def unique(values: Index) -> Index | np.ndarray: ... +def unique(values: Index) -> np.ndarray: ... @overload def unique(values: Categorical) -> Categorical: ... @overload diff --git a/tests/test_pandas.py b/tests/test_pandas.py index d8e3acedb..7112b8638 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -209,7 +209,7 @@ def test_unique() -> None: ] ) ), - Union[pd.Index, np.ndarray], + np.ndarray, ), pd.DatetimeIndex, ) @@ -248,13 +248,11 @@ def test_unique() -> None: np.ndarray, ) check( - assert_type( - pd.unique(pd.Index(["a", "b", "c", "a"])), Union[pd.Index, np.ndarray] - ), + assert_type(pd.unique(pd.Index(["a", "b", "c", "a"])), np.ndarray), np.ndarray, ) check( - assert_type(pd.unique(pd.RangeIndex(0, 10)), Union[pd.Index, np.ndarray]), + assert_type(pd.unique(pd.RangeIndex(0, 10)), np.ndarray), np.ndarray, ) check( @@ -271,7 +269,7 @@ def test_unique() -> None: check( assert_type( pd.unique(pd.timedelta_range(start="1 day", periods=4)), - Union[pd.Index, np.ndarray], + np.ndarray, ), np.ndarray, ) @@ -495,15 +493,15 @@ def test_index_unqiue() -> None: interval_i = pd.interval_range(1, 10, periods=10) check(assert_type(pd.unique(ci), pd.CategoricalIndex), pd.CategoricalIndex) - check(assert_type(pd.unique(dti), Union[pd.Index, np.ndarray]), np.ndarray) - check(assert_type(pd.unique(fi), Union[pd.Index, np.ndarray]), np.ndarray) - check(assert_type(pd.unique(i), Union[pd.Index, np.ndarray]), np.ndarray) - check(assert_type(pd.unique(i64i), Union[pd.Index, np.ndarray]), np.ndarray) + check(assert_type(pd.unique(dti), 
np.ndarray), np.ndarray) + check(assert_type(pd.unique(fi), np.ndarray), np.ndarray) + check(assert_type(pd.unique(i), np.ndarray), np.ndarray) + check(assert_type(pd.unique(i64i), np.ndarray), np.ndarray) check(assert_type(pd.unique(pi), pd.PeriodIndex), pd.PeriodIndex) - check(assert_type(pd.unique(ri), Union[pd.Index, np.ndarray]), np.ndarray) - check(assert_type(pd.unique(ui), Union[pd.Index, np.ndarray]), np.ndarray) - check(assert_type(pd.unique(tdi), Union[pd.Index, np.ndarray]), np.ndarray) - check(assert_type(pd.unique(mi), Union[pd.Index, np.ndarray]), np.ndarray) + check(assert_type(pd.unique(ri), np.ndarray), np.ndarray) + check(assert_type(pd.unique(ui), np.ndarray), np.ndarray) + check(assert_type(pd.unique(tdi), np.ndarray), np.ndarray) + check(assert_type(pd.unique(mi), np.ndarray), np.ndarray) check(assert_type(pd.unique(interval_i), pd.IntervalIndex), pd.IntervalIndex) From 64f0c1a56942d4dd0064f4332c7f28575c5a24fd Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 7 Oct 2022 07:49:19 +0100 Subject: [PATCH 13/20] TST: Add tests for merge --- pandas-stubs/core/reshape/merge.pyi | 2 +- tests/test_pandas.py | 164 ++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+), 1 deletion(-) diff --git a/pandas-stubs/core/reshape/merge.pyi b/pandas-stubs/core/reshape/merge.pyi index e2a09e76e..8709638ee 100644 --- a/pandas-stubs/core/reshape/merge.pyi +++ b/pandas-stubs/core/reshape/merge.pyi @@ -14,7 +14,7 @@ from pandas._typing import ( ) def merge( - # TODO: Verify Series is accepted and correct in docs + # TODO: Series is accepted -> correct in docs left: DataFrame | Series, right: DataFrame | Series, how: Literal["left", "right", "outer", "inner", "cross"] = ..., diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 7112b8638..0cdadc329 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -635,3 +635,167 @@ def test_qcut() -> None: check(assert_type(h1, npt.NDArray[np.float_]), np.ndarray) check(assert_type(i1, 
npt.NDArray[np.float_]), np.ndarray) check(assert_type(j1, npt.NDArray[np.float_]), np.ndarray) + + +def test_merge() -> None: + ls = pd.Series([1, 2, 3, 4], index=[1, 2, 3, 4], name="left") + rs = pd.Series([3, 4, 5, 6], index=[3, 4, 5, 6], name="right") + lf = pd.DataFrame(pd.Series([1, 2, 3, 4], index=[1, 2, 3, 4], name="left")) + rf = pd.DataFrame(pd.Series([3, 4, 5, 6], index=[3, 4, 5, 6], name="right")) + + check( + assert_type(pd.merge(ls, rs, left_on="left", right_on="right"), pd.DataFrame), + pd.DataFrame, + ) + check( + assert_type( + pd.merge(ls, rs, how="left", left_on="left", right_on="right"), pd.DataFrame + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge(ls, rs, how="right", left_on="left", right_on="right"), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge(ls, rs, how="outer", left_on="left", right_on="right"), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge(ls, rs, how="inner", left_on="left", right_on="right"), + pd.DataFrame, + ), + pd.DataFrame, + ) + # TOOD: When cross don't need on?? 
+ check(assert_type(pd.merge(ls, rs, how="cross"), pd.DataFrame), pd.DataFrame) + check( + assert_type( + pd.merge(ls, rs, how="inner", left_index=True, right_index=True), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge( + ls, + rs, + how="inner", + left_index=True, + right_index=True, + sort=True, + copy=True, + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge( + ls, + rs, + how="inner", + left_index=True, + right_index=True, + suffixes=["_1", "_2"], + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge( + ls, + rs, + how="inner", + left_index=True, + right_index=True, + suffixes=["_1", None], + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge( + ls, + rs, + how="inner", + left_index=True, + right_index=True, + suffixes=("_1", None), + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge( + ls, + rs, + how="inner", + left_index=True, + right_index=True, + suffixes=(None, "_2"), + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge( + ls, + rs, + how="inner", + left_index=True, + right_index=True, + suffixes=("_1", "_2"), + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge( + ls, rs, how="inner", left_index=True, right_index=True, indicator=True + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type(pd.merge(lf, rs, left_on="left", right_on="right"), pd.DataFrame), + pd.DataFrame, + ) + check( + assert_type(pd.merge(ls, rf, left_on="left", right_on="right"), pd.DataFrame), + pd.DataFrame, + ) + check( + assert_type(pd.merge(lf, rf, left_on="left", right_on="right"), pd.DataFrame), + pd.DataFrame, + ) + check( + assert_type( + pd.merge(lf, rf, left_on=["left"], right_on=["right"]), pd.DataFrame + ), + pd.DataFrame, + ) + check( + assert_type(pd.merge(lf, rf, left_index=True, right_index=True), pd.DataFrame), + pd.DataFrame, + ) From 
bc1b3c8fb8583a78eda331543903073425aeb93c Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 7 Oct 2022 09:47:05 +0100 Subject: [PATCH 14/20] TST: Add tests for merge_ordered and improve typing --- pandas-stubs/core/reshape/merge.pyi | 53 +++++++++++-- tests/test_pandas.py | 118 ++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 8 deletions(-) diff --git a/pandas-stubs/core/reshape/merge.pyi b/pandas-stubs/core/reshape/merge.pyi index 8709638ee..6a9efc0e4 100644 --- a/pandas-stubs/core/reshape/merge.pyi +++ b/pandas-stubs/core/reshape/merge.pyi @@ -1,4 +1,7 @@ -from typing import Literal +from typing import ( + Literal, + overload, +) from pandas import ( DataFrame, @@ -32,14 +35,15 @@ def merge( indicator: bool | str = ..., validate: ValidationOptions = ..., ) -> DataFrame: ... +@overload def merge_ordered( - # TODO: Verify Series is accepted and correct in docs - left: DataFrame | Series, - # TODO: Verify Series is accepted and correct in docs - right: DataFrame | Series, - on: Label | list[HashableT] | AnyArrayLike | None = ..., - left_on: Label | list[HashableT] | AnyArrayLike | None = ..., - right_on: Label | list[HashableT] | AnyArrayLike | None = ..., + # TODO: Series is accepted -> correct in docs + left: DataFrame, + # TODO: Series is accepted -> correct in docs + right: DataFrame, + on: Label | list[HashableT] | None = ..., + left_on: Label | list[HashableT] | None = ..., + right_on: Label | list[HashableT] | None = ..., left_by: Label | list[HashableT] | None = ..., right_by: Label | list[HashableT] | None = ..., fill_method: Literal["ffill"] | None = ..., @@ -49,6 +53,39 @@ def merge_ordered( | tuple[str, None] = ..., how: Literal["left", "right", "outer", "inner"] = ..., ) -> DataFrame: ... 
+@overload +def merge_ordered( + left: Series, + right: DataFrame | Series, + on: Label | list[HashableT] | None = ..., + left_on: Label | list[HashableT] | None = ..., + right_on: Label | list[HashableT] | None = ..., + # TODO: Update docs since left_by, right_by must be None if either is a series + left_by: None = ..., + right_by: None = ..., + fill_method: Literal["ffill"] | None = ..., + suffixes: list[str | None] + | tuple[str, str] + | tuple[None, str] + | tuple[str, None] = ..., + how: Literal["left", "right", "outer", "inner"] = ..., +) -> DataFrame: ... +@overload +def merge_ordered( + left: DataFrame | Series, + right: Series, + on: Label | list[HashableT] | None = ..., + left_on: Label | list[HashableT] | None = ..., + right_on: Label | list[HashableT] | None = ..., + left_by: None = ..., + right_by: None = ..., + fill_method: Literal["ffill"] | None = ..., + suffixes: list[str | None] + | tuple[str, str] + | tuple[None, str] + | tuple[str, None] = ..., + how: Literal["left", "right", "outer", "inner"] = ..., +) -> DataFrame: ... 
def merge_asof( left: DataFrame | Series, right: DataFrame | Series, diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 0cdadc329..a20f11fc8 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -799,3 +799,121 @@ def test_merge() -> None: assert_type(pd.merge(lf, rf, left_index=True, right_index=True), pd.DataFrame), pd.DataFrame, ) + + +def test_merge_ordered() -> None: + ls = pd.Series([1, 2, 3, 4], index=[1, 2, 3, 4], name="left") + rs = pd.Series([3, 4, 5, 6], index=[3, 4, 5, 6], name="right") + lf = pd.DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7], [7, 8, 9]], + index=[1, 2, 3, 4], + columns=["a", "b", "c"], + ) + rf = pd.DataFrame(pd.Series([3, 4, 5, 6], index=[3, 4, 5, 6], name="b")) + + check( + assert_type( + pd.merge_ordered(ls, rs, left_on="left", right_on="right"), pd.DataFrame + ), + pd.DataFrame, + ) + check(assert_type(pd.merge_ordered(lf, rf, on="b"), pd.DataFrame), pd.DataFrame) + check( + assert_type(pd.merge_ordered(lf, rf, left_on="a", right_on="b"), pd.DataFrame), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_ordered(lf, rf, left_on="b", right_on="b", how="outer"), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_ordered(lf, rf, left_on=["b"], right_on=["b"], how="outer"), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_ordered(lf, rf, left_on="b", right_on="b", how="inner"), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_ordered(lf, rf, left_on="b", right_on="b", how="left"), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_ordered(lf, rf, left_on="b", right_on="b", how="right"), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type(pd.merge_ordered(lf, rf, left_by="a"), pd.DataFrame), pd.DataFrame + ) + check( + assert_type( + pd.merge_ordered(lf, rf, left_by=["a", "c"], fill_method="ffill"), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_ordered(lf, rf, on="b", 
suffixes=["_1", None]), pd.DataFrame + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_ordered(lf, rf, on="b", suffixes=("_1", None)), pd.DataFrame + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_ordered(lf, rf, on="b", suffixes=(None, "_2")), pd.DataFrame + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_ordered(lf, rf, on="b", suffixes=("_1", "_2")), pd.DataFrame + ), + pd.DataFrame, + ) + + +def test_merge_asof() -> None: + pass + + # def merge_asof( + # left: DataFrame | Series, + # right: DataFrame | Series, + # on: Label | None = ..., + # # TODO: Is AnyArrayLike accepted? Not in docs + # left_on: Label | AnyArrayLike | None = ..., + # # TODO: Is AnyArrayLike accepted? Not in docs + # right_on: Label | AnyArrayLike | None = ..., + # left_index: bool = ..., + # right_index: bool = ..., + # by: Label | list[HashableT] | None = ..., + # left_by: Label | None = ..., + # right_by: Label | None = ..., + # suffixes: list[str | None] + # | tuple[str, str] + # | tuple[None, str] + # | tuple[str, None] = ..., + # tolerance: int | Timedelta | None = ..., + # allow_exact_matches: bool = ..., + # direction: Literal["backward", "forward", "nearest"] = ..., + # ) -> DataFrame: ... From 0cf5ec310671aebc9bb3a7c29555828479aec5cd Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 7 Oct 2022 10:20:09 +0100 Subject: [PATCH 15/20] TST: Add tests for merge_asof and improve typing accuracy --- pandas-stubs/core/reshape/merge.pyi | 10 +- tests/test_pandas.py | 192 ++++++++++++++++++++++++---- 2 files changed, 173 insertions(+), 29 deletions(-) diff --git a/pandas-stubs/core/reshape/merge.pyi b/pandas-stubs/core/reshape/merge.pyi index 6a9efc0e4..6e11f967d 100644 --- a/pandas-stubs/core/reshape/merge.pyi +++ b/pandas-stubs/core/reshape/merge.pyi @@ -90,15 +90,13 @@ def merge_asof( left: DataFrame | Series, right: DataFrame | Series, on: Label | None = ..., - # TODO: Is AnyArrayLike accepted? 
Not in docs - left_on: Label | AnyArrayLike | None = ..., - # TODO: Is AnyArrayLike accepted? Not in docs - right_on: Label | AnyArrayLike | None = ..., + left_on: Label | None = ..., + right_on: Label | None = ..., left_index: bool = ..., right_index: bool = ..., by: Label | list[HashableT] | None = ..., - left_by: Label | None = ..., - right_by: Label | None = ..., + left_by: Label | list[HashableT] | None = ..., + right_by: Label | list[HashableT] | None = ..., suffixes: list[str | None] | tuple[str, str] | tuple[None, str] diff --git a/tests/test_pandas.py b/tests/test_pandas.py index a20f11fc8..ee50c088e 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -894,26 +894,172 @@ def test_merge_ordered() -> None: def test_merge_asof() -> None: - pass - - # def merge_asof( - # left: DataFrame | Series, - # right: DataFrame | Series, - # on: Label | None = ..., - # # TODO: Is AnyArrayLike accepted? Not in docs - # left_on: Label | AnyArrayLike | None = ..., - # # TODO: Is AnyArrayLike accepted? Not in docs - # right_on: Label | AnyArrayLike | None = ..., - # left_index: bool = ..., - # right_index: bool = ..., - # by: Label | list[HashableT] | None = ..., - # left_by: Label | None = ..., - # right_by: Label | None = ..., - # suffixes: list[str | None] - # | tuple[str, str] - # | tuple[None, str] - # | tuple[str, None] = ..., - # tolerance: int | Timedelta | None = ..., - # allow_exact_matches: bool = ..., - # direction: Literal["backward", "forward", "nearest"] = ..., - # ) -> DataFrame: ... 
+ ls = pd.Series([1, 2, 3, 4], index=[1, 2, 3, 4], name="left") + rs = pd.Series([3, 4, 5, 6], index=[3, 4, 5, 6], name="right") + lf = pd.DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7], [7, 8, 9]], + index=[1, 2, 3, 4], + columns=["a", "b", "c"], + ) + rf = pd.DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7], [7, 8, 9]], + index=[1, 2, 3, 4], + columns=["a", "b", "d"], + ) + + check( + assert_type( + pd.merge_asof(ls, rs, left_on="left", right_on="right"), pd.DataFrame + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_asof(ls, rs, left_index=True, right_index=True), pd.DataFrame + ), + pd.DataFrame, + ) + + check(assert_type(pd.merge_asof(lf, rf, on="a"), pd.DataFrame), pd.DataFrame) + check( + assert_type(pd.merge_asof(lf, rf, left_on="a", right_on="b"), pd.DataFrame), + pd.DataFrame, + ) + + check( + assert_type(pd.merge_asof(lf, rf, on="a", by="b"), pd.DataFrame), pd.DataFrame + ) + check( + assert_type( + pd.merge_asof(lf, rf, left_on="c", right_on="d", by=["a", "b"]), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_asof(lf, rf, on="a", left_by=["c"], right_by=["d"]), pd.DataFrame + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_asof(lf, rf, on="a", left_by=["b", "c"], right_by=["b", "d"]), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type(pd.merge_asof(lf, rf, on="a", suffixes=["_1", None]), pd.DataFrame), + pd.DataFrame, + ) + check( + assert_type(pd.merge_asof(lf, rf, on="a", suffixes=("_1", None)), pd.DataFrame), + pd.DataFrame, + ) + check( + assert_type(pd.merge_asof(lf, rf, on="a", suffixes=("_1", "_2")), pd.DataFrame), + pd.DataFrame, + ) + check( + assert_type(pd.merge_asof(lf, rf, on="a", suffixes=(None, "_2")), pd.DataFrame), + pd.DataFrame, + ) + + quotes = pd.DataFrame( + { + "time": [ + pd.Timestamp("2016-05-25 13:30:00.023"), + pd.Timestamp("2016-05-25 13:30:00.023"), + pd.Timestamp("2016-05-25 13:30:00.030"), + pd.Timestamp("2016-05-25 13:30:00.041"), + pd.Timestamp("2016-05-25 
13:30:00.048"), + pd.Timestamp("2016-05-25 13:30:00.049"), + pd.Timestamp("2016-05-25 13:30:00.072"), + pd.Timestamp("2016-05-25 13:30:00.075"), + ], + "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], + "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + } + ) + trades = pd.DataFrame( + { + "time": [ + pd.Timestamp("2016-05-25 13:30:00.023"), + pd.Timestamp("2016-05-25 13:30:00.038"), + pd.Timestamp("2016-05-25 13:30:00.048"), + pd.Timestamp("2016-05-25 13:30:00.048"), + pd.Timestamp("2016-05-25 13:30:00.048"), + ], + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.0], + "quantity": [75, 155, 100, 100, 100], + } + ) + + check( + assert_type( + pd.merge_asof( + trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("10ms") + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_asof( + trades, + quotes, + on="time", + by="ticker", + tolerance=pd.Timedelta("10ms"), + allow_exact_matches=False, + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_asof( + trades, + quotes, + on="time", + by="ticker", + tolerance=pd.Timedelta("10ms"), + direction="backward", + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_asof( + trades, + quotes, + on="time", + by="ticker", + tolerance=pd.Timedelta("10ms"), + direction="forward", + ), + pd.DataFrame, + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_asof( + trades, + quotes, + on="time", + by="ticker", + tolerance=pd.Timedelta("10ms"), + direction="nearest", + ), + pd.DataFrame, + ), + pd.DataFrame, + ) From be824c68704c6aec10c9ac730ccce4791a8da159 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 7 Oct 2022 10:42:46 +0100 Subject: [PATCH 16/20] CLN: Remove TODO since pandas PR opened --- pandas-stubs/core/reshape/merge.pyi | 4 ---- 1 file changed, 4 deletions(-) 
diff --git a/pandas-stubs/core/reshape/merge.pyi b/pandas-stubs/core/reshape/merge.pyi index 6e11f967d..552490753 100644 --- a/pandas-stubs/core/reshape/merge.pyi +++ b/pandas-stubs/core/reshape/merge.pyi @@ -17,7 +17,6 @@ from pandas._typing import ( ) def merge( - # TODO: Series is accepted -> correct in docs left: DataFrame | Series, right: DataFrame | Series, how: Literal["left", "right", "outer", "inner", "cross"] = ..., @@ -37,9 +36,7 @@ def merge( ) -> DataFrame: ... @overload def merge_ordered( - # TODO: Series is accepted -> correct in docs left: DataFrame, - # TODO: Series is accepted -> correct in docs right: DataFrame, on: Label | list[HashableT] | None = ..., left_on: Label | list[HashableT] | None = ..., @@ -60,7 +57,6 @@ def merge_ordered( on: Label | list[HashableT] | None = ..., left_on: Label | list[HashableT] | None = ..., right_on: Label | list[HashableT] | None = ..., - # TODO: Update docs since left_by, right_by must be None if either is a series left_by: None = ..., right_by: None = ..., fill_method: Literal["ffill"] | None = ..., From 1c122048d76749348d3a675261364686b94bb5d2 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 7 Oct 2022 17:10:37 +0100 Subject: [PATCH 17/20] TYP: Final refinements --- pandas-stubs/core/reshape/tile.pyi | 6 +++--- tests/test_pandas.py | 33 ++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/pandas-stubs/core/reshape/tile.pyi b/pandas-stubs/core/reshape/tile.pyi index 1f7391bbe..bc0a50e19 100644 --- a/pandas-stubs/core/reshape/tile.pyi +++ b/pandas-stubs/core/reshape/tile.pyi @@ -44,7 +44,7 @@ def cut( include_lowest: bool = ..., duplicates: Literal["raise", "drop"] = ..., ordered: bool = ..., -) -> tuple[npt.NDArray, IntervalIndex]: ... +) -> tuple[npt.NDArray[np.intp], IntervalIndex]: ... 
@overload def cut( x: Series, @@ -161,7 +161,7 @@ def qcut( retbins: Literal[False] = ..., precision: int = ..., duplicates: Literal["raise", "drop"] = ..., -) -> npt.NDArray: ... +) -> npt.NDArray[np.intp]: ... @overload def qcut( x: Index | npt.NDArray | Sequence[int] | Sequence[float], @@ -189,7 +189,7 @@ def qcut( retbins: Literal[True], precision: int = ..., duplicates: Literal["raise", "drop"] = ..., -) -> tuple[npt.NDArray, npt.NDArray[np.float_]]: ... +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.float_]]: ... @overload def qcut( x: Series, diff --git a/tests/test_pandas.py b/tests/test_pandas.py index ee50c088e..60bb8b687 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -17,7 +17,10 @@ from pandas._typing import Scalar -from tests import check +from tests import ( + TYPE_CHECKING_INVALID_USAGE, + check, +) def test_types_to_datetime() -> None: @@ -624,10 +627,10 @@ def test_qcut() -> None: check(assert_type(e0, pd.Categorical), pd.Categorical) check(assert_type(f0, pd.Categorical), pd.Categorical) - check(assert_type(g0, npt.NDArray), np.ndarray) + check(assert_type(g0, npt.NDArray[np.intp]), np.ndarray) check(assert_type(h0, pd.Series), pd.Series) - check(assert_type(i0, npt.NDArray), np.ndarray) - check(assert_type(j0, npt.NDArray), np.ndarray) + check(assert_type(i0, npt.NDArray[np.intp]), np.ndarray) + check(assert_type(j0, npt.NDArray[np.intp]), np.ndarray) check(assert_type(e1, npt.NDArray[np.float_]), np.ndarray) check(assert_type(f1, npt.NDArray[np.float_]), np.ndarray) @@ -817,6 +820,18 @@ def test_merge_ordered() -> None: ), pd.DataFrame, ) + check( + assert_type( + pd.merge_ordered(ls, rf, left_on="left", right_on="b"), pd.DataFrame + ), + pd.DataFrame, + ) + check( + assert_type( + pd.merge_ordered(lf, rs, left_on="a", right_on="right"), pd.DataFrame + ), + pd.DataFrame, + ) check(assert_type(pd.merge_ordered(lf, rf, on="b"), pd.DataFrame), pd.DataFrame) check( assert_type(pd.merge_ordered(lf, rf, left_on="a", right_on="b"), 
pd.DataFrame), @@ -891,6 +906,16 @@ def test_merge_ordered() -> None: ), pd.DataFrame, ) + if TYPE_CHECKING_INVALID_USAGE: + pd.merge_ordered( # type: ignore[call-overload] + ls, rs, left_on="left", right_on="right", left_by="left", right_by="right" + ) + pd.merge_ordered( # type: ignore[call-overload] + ls, rf, left_on="left", right_on="b", left_by="left", right_by="b" + ) + pd.merge_ordered( # type: ignore[call-overload] + lf, rs, left_on="a", right_on="right", left_by="a", right_by="right" + ) def test_merge_asof() -> None: From a5e43fd7bd66a2434a816860b5a7a4ba8bc8ba5b Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 7 Oct 2022 18:07:11 +0100 Subject: [PATCH 18/20] TST: Fix intentionally failing test --- tests/test_pandas.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 60bb8b687..742da67cb 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -908,13 +908,28 @@ def test_merge_ordered() -> None: ) if TYPE_CHECKING_INVALID_USAGE: pd.merge_ordered( # type: ignore[call-overload] - ls, rs, left_on="left", right_on="right", left_by="left", right_by="right" + ls, + rs, + left_on="left", + right_on="right", + left_by="left", # pyright: ignore + right_by="right", # pyright: ignore ) pd.merge_ordered( # type: ignore[call-overload] - ls, rf, left_on="left", right_on="b", left_by="left", right_by="b" + ls, + rf, # pyright: ignore + left_on="left", + right_on="b", + left_by="left", # pyright: ignore + right_by="b", # pyright: ignore ) pd.merge_ordered( # type: ignore[call-overload] - lf, rs, left_on="a", right_on="right", left_by="a", right_by="right" + lf, + rs, + left_on="a", + right_on="right", + left_by="a", # pyright: ignore + right_by="right", # pyright: ignore ) From 620e79927b7e177a55c57ab1822ab6cef55edb4b Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 7 Oct 2022 19:45:19 +0100 Subject: [PATCH 19/20] MAINT: Refactor MergeHow and add JoinHow --- 
pandas-stubs/_typing.pyi | 3 ++- pandas-stubs/core/frame.pyi | 5 +++-- pandas-stubs/core/series.pyi | 4 ++-- pandas-stubs/core/strings.pyi | 8 ++++---- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas-stubs/_typing.pyi b/pandas-stubs/_typing.pyi index 4d6e53247..f297b2985 100644 --- a/pandas-stubs/_typing.pyi +++ b/pandas-stubs/_typing.pyi @@ -287,7 +287,8 @@ FillnaOptions: TypeAlias = Literal["backfill", "bfill", "ffill", "pad"] ReplaceMethod: TypeAlias = Literal["pad", "ffill", "bfill"] SortKind: TypeAlias = Literal["quicksort", "mergesort", "heapsort", "stable"] NaPosition: TypeAlias = Literal["first", "last"] -MergeHow: TypeAlias = Literal["left", "right", "outer", "inner"] +JoinHow: TypeAlias = Literal["left", "right", "outer", "inner"] +MergeHow: TypeAlias = Union[JoinHow, Literal["cross"]] JsonFrameOrient: TypeAlias = Literal[ "split", "records", "index", "columns", "values", "table" ] diff --git a/pandas-stubs/core/frame.pyi b/pandas-stubs/core/frame.pyi index de4dab9ca..e16943c91 100644 --- a/pandas-stubs/core/frame.pyi +++ b/pandas-stubs/core/frame.pyi @@ -74,6 +74,7 @@ from pandas._typing import ( IndexLabel, IndexType, IntervalClosedType, + JoinHow, JsonFrameOrient, Label, Level, @@ -532,7 +533,7 @@ class DataFrame(NDFrame, OpsMixin): def align( self, other: DataFrame | Series, - join: MergeHow = ..., + join: JoinHow = ..., axis: AxisType | None = ..., level: Level | None = ..., copy: _bool = ..., @@ -1102,7 +1103,7 @@ class DataFrame(NDFrame, OpsMixin): self, other: DataFrame | Series | list[DataFrame | Series], on: _str | list[_str] | None = ..., - how: MergeHow = ..., + how: JoinHow = ..., lsuffix: _str = ..., rsuffix: _str = ..., sort: _bool = ..., diff --git a/pandas-stubs/core/series.pyi b/pandas-stubs/core/series.pyi index 85c5de1b8..d44ac25f8 100644 --- a/pandas-stubs/core/series.pyi +++ b/pandas-stubs/core/series.pyi @@ -84,11 +84,11 @@ from pandas._typing import ( HashableT, IgnoreRaise, IndexingInt, + JoinHow, 
JsonSeriesOrient, Level, ListLike, MaskType, - MergeHow, NaPosition, QuantileInterpolation, Renamer, @@ -687,7 +687,7 @@ class Series(IndexOpsMixin, NDFrame, Generic[S1]): def align( self, other: DataFrame | Series, - join: MergeHow = ..., + join: JoinHow = ..., axis: AxisType | None = ..., level: Level | None = ..., copy: _bool = ..., diff --git a/pandas-stubs/core/strings.pyi b/pandas-stubs/core/strings.pyi index b9ffceea0..800a2a821 100644 --- a/pandas-stubs/core/strings.pyi +++ b/pandas-stubs/core/strings.pyi @@ -19,7 +19,7 @@ from pandas import ( from pandas.core.base import NoNewAttributesMixin from pandas._typing import ( - MergeHow, + JoinHow, T, ) @@ -36,7 +36,7 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS]): *, sep: str, na_rep: str | None = ..., - join: MergeHow = ..., + join: JoinHow = ..., ) -> str: ... @overload def cat( @@ -45,7 +45,7 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS]): *, sep: str, na_rep: str | None = ..., - join: MergeHow = ..., + join: JoinHow = ..., ) -> str: ... @overload def cat( @@ -53,7 +53,7 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS]): others: Series | pd.Index | pd.DataFrame | np.ndarray | list[Any], sep: str = ..., na_rep: str | None = ..., - join: MergeHow = ..., + join: JoinHow = ..., ) -> T: ... 
@overload def split( From 7876a5e19eb062fc1a14483af59ebe2f063f5a24 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 7 Oct 2022 20:19:14 +0100 Subject: [PATCH 20/20] TYP: Correct use of MergeHow and add test --- pandas-stubs/core/reshape/merge.pyi | 10 ++++++---- tests/test_frame.py | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas-stubs/core/reshape/merge.pyi b/pandas-stubs/core/reshape/merge.pyi index 552490753..71d8651db 100644 --- a/pandas-stubs/core/reshape/merge.pyi +++ b/pandas-stubs/core/reshape/merge.pyi @@ -12,14 +12,16 @@ from pandas._libs.tslibs import Timedelta from pandas._typing import ( AnyArrayLike, HashableT, + JoinHow, Label, + MergeHow, ValidationOptions, ) def merge( left: DataFrame | Series, right: DataFrame | Series, - how: Literal["left", "right", "outer", "inner", "cross"] = ..., + how: MergeHow = ..., on: Label | list[HashableT] | AnyArrayLike | None = ..., left_on: Label | list[HashableT] | AnyArrayLike | None = ..., right_on: Label | list[HashableT] | AnyArrayLike | None = ..., @@ -48,7 +50,7 @@ def merge_ordered( | tuple[str, str] | tuple[None, str] | tuple[str, None] = ..., - how: Literal["left", "right", "outer", "inner"] = ..., + how: JoinHow = ..., ) -> DataFrame: ... @overload def merge_ordered( @@ -64,7 +66,7 @@ def merge_ordered( | tuple[str, str] | tuple[None, str] | tuple[str, None] = ..., - how: Literal["left", "right", "outer", "inner"] = ..., + how: JoinHow = ..., ) -> DataFrame: ... @overload def merge_ordered( @@ -80,7 +82,7 @@ def merge_ordered( | tuple[str, str] | tuple[None, str] | tuple[str, None] = ..., - how: Literal["left", "right", "outer", "inner"] = ..., + how: JoinHow = ..., ) -> DataFrame: ... 
def merge_asof( left: DataFrame | Series, diff --git a/tests/test_frame.py b/tests/test_frame.py index 093bd565a..63ef1148a 100644 --- a/tests/test_frame.py +++ b/tests/test_frame.py @@ -723,6 +723,7 @@ def test_types_merge() -> None: df.merge(df2, on=("col1", "col2"), how="left", suffixes=(None, "s")) df.merge(df2, on=("col1", "col2"), how="left", suffixes=("t", "s")) df.merge(df2, on=("col1", "col2"), how="left", suffixes=("a", None)) + df.merge(df2, how="cross") # GH 289 columns = ["col1", "col2"] df.merge(df2, on=columns)