From b6e52dec9fe6de0f7c9e2d4b7fd5036950b9b8c9 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 30 Dec 2020 21:58:51 -0500 Subject: [PATCH 1/5] WIP --- pandas/conftest.py | 77 +++++++++++- pandas/tests/series/methods/test_rank.py | 144 ++++++++++------------- 2 files changed, 138 insertions(+), 83 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2862f7c957abc..51890790289ac 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -40,7 +40,8 @@ import pandas._testing as tm from pandas.core import ops from pandas.core.indexes.api import Index, MultiIndex - +from pandas._libs import iNaT +from pandas._libs.algos import Infinity, NegInfinity # ---------------------------------------------------------------- # Configuration / Settings @@ -591,6 +592,80 @@ def narrow_series(request): return _narrow_series[request.param].copy() +dtype_map = { + "float64": [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-50, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float32": [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-45, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "uint8": [np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], + "int64": [ + np.iinfo(np.int64).min, + -100, + 0, + 1, + 9999, + 100000, + 1e10, + np.iinfo(np.int64).max, + ], + "object": [NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], +} +dtype_na_map = { + "float64": np.nan, + "float32": np.nan, + "int64": iNaT, + "object": None, +} + + +@pytest.fixture(params=dtype_map.keys()) +def rank_blub(request): + dtype = request.param + data = dtype_map[dtype] + values = np.array(data, dtype=dtype) + exp_order = np.array(range(len(values)), dtype="float64") + 1.0 + # Insert nans at random positions if underlying dtype has missing + # value. Then adjust the expected order by adding nans accordingly + # This is for testing whether rank calculation is affected + # when values are interwined with nan values. + if dtype in dtype_na_map: + na_value = dtype_na_map[dtype] + nan_indices = np.random.choice(range(len(values)), 5) + values = np.insert(values, nan_indices, na_value) + exp_order = np.insert(exp_order, nan_indices, np.nan) + # shuffle the testing array and expected results in the same way + random_order = np.random.permutation(len(values)) + iseries = Series(values[random_order]) + exp = Series(exp_order[random_order], dtype="float64") + return iseries, exp + + _index_or_series_objs = {**indices_dict, **_series, **_narrow_series} diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 6d3c37659f5c4..7a301adfe7c36 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -206,88 +206,68 @@ def test_rank_signature(self): with pytest.raises(ValueError, match=msg): s.rank("average") - @pytest.mark.parametrize( - "contents,dtype", - [ - ( - [ - -np.inf, - -50, - -1, - -1e-20, - -1e-25, - -1e-50, - 0, - 1e-40, - 1e-20, - 1e-10, - 2, - 40, - np.inf, - ], - "float64", - ), - ( - [ - -np.inf, - -50, - -1, - -1e-20, - -1e-25, - -1e-45, - 0, - 1e-40, - 1e-20, - 1e-10, - 2, - 40, - np.inf, - ], - "float32", - ), - ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"), - pytest.param( - [ - np.iinfo(np.int64).min, - -100, - 0, - 1, - 9999, - 100000, - 1e10, - np.iinfo(np.int64).max, - ], - "int64", - marks=pytest.mark.xfail( - reason="iNaT is equivalent to minimum value of dtype" - "int64 pending issue GH#16674" - ), - ), - ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"), - ], - ) - def test_rank_inf(self, contents, dtype): - dtype_na_map = { - "float64": np.nan, - "float32": np.nan, - "int64": iNaT, - "object": None, - } - # Insert nans at random positions if underlying dtype has missing - # value. Then adjust the expected order by adding nans accordingly - # This is for testing whether rank calculation is affected - # when values are interwined with nan values. - values = np.array(contents, dtype=dtype) - exp_order = np.array(range(len(values)), dtype="float64") + 1.0 - if dtype in dtype_na_map: - na_value = dtype_na_map[dtype] - nan_indices = np.random.choice(range(len(values)), 5) - values = np.insert(values, nan_indices, na_value) - exp_order = np.insert(exp_order, nan_indices, np.nan) - # shuffle the testing array and expected results in the same way - random_order = np.random.permutation(len(values)) - iseries = Series(values[random_order]) - exp = Series(exp_order[random_order], dtype="float64") + # @pytest.mark.parametrize( + # "contents,dtype", + # [ + # ( + # [ + # -np.inf, + # -50, + # -1, + # -1e-20, + # -1e-25, + # -1e-50, + # 0, + # 1e-40, + # 1e-20, + # 1e-10, + # 2, + # 40, + # np.inf, + # ], + # "float64", + # ), + # ( + # [ + # -np.inf, + # -50, + # -1, + # -1e-20, + # -1e-25, + # -1e-45, + # 0, + # 1e-40, + # 1e-20, + # 1e-10, + # 2, + # 40, + # np.inf, + # ], + # "float32", + # ), + # ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"), + # pytest.param( + # [ + # np.iinfo(np.int64).min, + # -100, + # 0, + # 1, + # 9999, + # 100000, + # 1e10, + # np.iinfo(np.int64).max, + # ], + # "int64", + # marks=pytest.mark.xfail( + # reason="iNaT is equivalent to minimum value of dtype" + # "int64 pending issue GH#16674" + # ), + # ), + # ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"), + # ], + # ) + def test_rank_inf(self, rank_blub): + iseries, exp = rank_blub iranks = iseries.rank() tm.assert_series_equal(iranks, exp) From 863e4d65259f2bbc65ccb0384051c7ded499f620 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 1 Jan 2021 20:03:44 -0500 Subject: [PATCH 2/5] TST/CLN: deduplicate troublesome rank values --- pandas/conftest.py | 30 ++++++--- pandas/tests/frame/methods/test_rank.py | 86 ++---------------------- pandas/tests/series/methods/test_rank.py | 68 ++----------------- 3 files changed, 29 insertions(+), 155 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 51890790289ac..af0271f2c960c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -31,6 +31,8 @@ import pytest from pytz import FixedOffset, utc +from pandas._libs import iNaT +from pandas._libs.algos import Infinity, NegInfinity import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype @@ -40,8 +42,7 @@ import pandas._testing as tm from pandas.core import ops from pandas.core.indexes.api import Index, MultiIndex -from pandas._libs import iNaT -from pandas._libs.algos import Infinity, NegInfinity + # ---------------------------------------------------------------- # Configuration / Settings @@ -592,7 +593,7 @@ def narrow_series(request): return _narrow_series[request.param].copy() -dtype_map = { +_dtype_nuisance_arr_map = { "float64": [ -np.inf, -50, @@ -636,7 +637,8 @@ def narrow_series(request): ], "object": [NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], } -dtype_na_map = { + +_dtype_na_map = { "float64": np.nan, "float32": np.nan, "int64": iNaT, @@ -644,18 +646,28 @@ def narrow_series(request): } -@pytest.fixture(params=dtype_map.keys()) -def rank_blub(request): +@pytest.fixture(params=_dtype_nuisance_arr_map.keys()) +def nuisance_rank_series_and_expected(request): + """ + Fixture for Series with troublesome values for rank + algorithms + """ dtype = request.param - data = dtype_map[dtype] + if dtype == "int64": + mark = pytest.mark.xfail( + reason="iNaT is equivalent to minimum value of dtype" + "int64 pending issue GH#16674" + ) + request.node.add_marker(mark) + data = _dtype_nuisance_arr_map[dtype] values = np.array(data, dtype=dtype) exp_order = np.array(range(len(values)), dtype="float64") + 1.0 # Insert nans at random positions if underlying dtype has missing # value. Then adjust the expected order by adding nans accordingly # This is for testing whether rank calculation is affected # when values are interwined with nan values. - if dtype in dtype_na_map: - na_value = dtype_na_map[dtype] + if dtype in _dtype_na_map: + na_value = _dtype_na_map[dtype] nan_indices = np.random.choice(range(len(values)), 5) values = np.insert(values, nan_indices, na_value) exp_order = np.insert(exp_order, nan_indices, np.nan) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 991a91275ae1d..3f4b1aa8c6f81 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -332,88 +332,10 @@ def test_pct_max_many_rows(self): result = df.rank(pct=True).max() assert (result == 1).all() - @pytest.mark.parametrize( - "contents,dtype", - [ - ( - [ - -np.inf, - -50, - -1, - -1e-20, - -1e-25, - -1e-50, - 0, - 1e-40, - 1e-20, - 1e-10, - 2, - 40, - np.inf, - ], - "float64", - ), - ( - [ - -np.inf, - -50, - -1, - -1e-20, - -1e-25, - -1e-45, - 0, - 1e-40, - 1e-20, - 1e-10, - 2, - 40, - np.inf, - ], - "float32", - ), - ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"), - pytest.param( - [ - np.iinfo(np.int64).min, - -100, - 0, - 1, - 9999, - 100000, - 1e10, - np.iinfo(np.int64).max, - ], - "int64", - marks=pytest.mark.xfail( - reason="iNaT is equivalent to minimum value of dtype" - "int64 pending issue GH#16674" - ), - ), - ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"), - ], - ) - def test_rank_inf_and_nan(self, contents, dtype): - dtype_na_map = { - "float64": np.nan, - "float32": np.nan, - "int64": iNaT, - "object": None, - } - # Insert nans at random positions if underlying dtype has missing - # value. Then adjust the expected order by adding nans accordingly - # This is for testing whether rank calculation is affected - # when values are interwined with nan values. - values = np.array(contents, dtype=dtype) - exp_order = np.array(range(len(values)), dtype="float64") + 1.0 - if dtype in dtype_na_map: - na_value = dtype_na_map[dtype] - nan_indices = np.random.choice(range(len(values)), 5) - values = np.insert(values, nan_indices, na_value) - exp_order = np.insert(exp_order, nan_indices, np.nan) - # shuffle the testing array and expected results in the same way - random_order = np.random.permutation(len(values)) - df = DataFrame({"a": values[random_order]}) - expected = DataFrame({"a": exp_order[random_order]}, dtype="float64") + def test_rank_inf_and_nan(self, nuisance_rank_series_and_expected): + series, expected = nuisance_rank_series_and_expected + df = DataFrame({"a": series}) + expected = DataFrame({"a": expected}) result = df.rank() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 7a301adfe7c36..cdb2afe4763ea 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -206,70 +206,10 @@ def test_rank_signature(self): with pytest.raises(ValueError, match=msg): s.rank("average") - # @pytest.mark.parametrize( - # "contents,dtype", - # [ - # ( - # [ - # -np.inf, - # -50, - # -1, - # -1e-20, - # -1e-25, - # -1e-50, - # 0, - # 1e-40, - # 1e-20, - # 1e-10, - # 2, - # 40, - # np.inf, - # ], - # "float64", - # ), - # ( - # [ - # -np.inf, - # -50, - # -1, - # -1e-20, - # -1e-25, - # -1e-45, - # 0, - # 1e-40, - # 1e-20, - # 1e-10, - # 2, - # 40, - # np.inf, - # ], - # "float32", - # ), - # ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"), - # pytest.param( - # [ - # np.iinfo(np.int64).min, - # -100, - # 0, - # 1, - # 9999, - # 100000, - # 1e10, - # np.iinfo(np.int64).max, - # ], - # "int64", - # marks=pytest.mark.xfail( - # reason="iNaT is equivalent to minimum value of dtype" - # "int64 pending issue GH#16674" - # ), - # ), - # ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"), - # ], - # ) - def test_rank_inf(self, rank_blub): - iseries, exp = rank_blub - iranks = iseries.rank() - tm.assert_series_equal(iranks, exp) + def test_rank_inf(self, nuisance_rank_series_and_expected): + series, expected = nuisance_rank_series_and_expected + result = series.rank() + tm.assert_series_equal(result, expected) def test_rank_tie_methods(self): s = self.s From da4c76be582a9ecb64a2a82e1740f6ac1f4f68bb Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 1 Jan 2021 20:05:50 -0500 Subject: [PATCH 3/5] Remove unneeded imports --- pandas/tests/frame/methods/test_rank.py | 2 -- pandas/tests/series/methods/test_rank.py | 1 - 2 files changed, 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 3f4b1aa8c6f81..b5da822ee6746 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._libs import iNaT -from pandas._libs.algos import Infinity, NegInfinity import pandas.util._test_decorators as td from pandas import DataFrame, Series diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index cdb2afe4763ea..d07cf16102d49 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -3,7 +3,6 @@ import numpy as np import pytest -from pandas._libs import iNaT from pandas._libs.algos import Infinity, NegInfinity import pandas.util._test_decorators as td From c77a84f5d5d6a46e89942719db322840c9f803fd Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 3 Jan 2021 13:40:21 -0500 Subject: [PATCH 4/5] Address comments --- pandas/conftest.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index af0271f2c960c..57268611a8c0a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -593,7 +593,8 @@ def narrow_series(request): return _narrow_series[request.param].copy() -_dtype_nuisance_arr_map = { +# Used in tests in (series|frame)/methods/test_rank.py +_nuisance_arr_for_rank_by_dtype = { "float64": [ -np.inf, -50, @@ -638,20 +639,19 @@ def narrow_series(request): "object": [NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], } -_dtype_na_map = { - "float64": np.nan, - "float32": np.nan, - "int64": iNaT, - "object": None, -} - -@pytest.fixture(params=_dtype_nuisance_arr_map.keys()) +@pytest.fixture(params=_nuisance_arr_for_rank_by_dtype.keys()) def nuisance_rank_series_and_expected(request): """ Fixture for Series with troublesome values for rank algorithms """ + _dtype_na_map = { + "float64": np.nan, + "float32": np.nan, + "int64": iNaT, + "object": None, + } dtype = request.param if dtype == "int64": mark = pytest.mark.xfail( @@ -659,7 +659,7 @@ def nuisance_rank_series_and_expected(request): "int64 pending issue GH#16674" ) request.node.add_marker(mark) - data = _dtype_nuisance_arr_map[dtype] + data = _nuisance_arr_for_rank_by_dtype[dtype] values = np.array(data, dtype=dtype) exp_order = np.array(range(len(values)), dtype="float64") + 1.0 # Insert nans at random positions if underlying dtype has missing From 4961e38ecb58977bb72adf4d6a3c5b1fef713501 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 3 Jan 2021 18:23:56 -0500 Subject: [PATCH 5/5] Use frame_or_series instead --- pandas/conftest.py | 87 ---------------------- pandas/tests/frame/methods/test_rank.py | 93 ++++++++++++++++++++++-- pandas/tests/series/methods/test_rank.py | 5 -- 3 files changed, 87 insertions(+), 98 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 57268611a8c0a..2862f7c957abc 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -31,8 +31,6 @@ import pytest from pytz import FixedOffset, utc -from pandas._libs import iNaT -from pandas._libs.algos import Infinity, NegInfinity import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype @@ -593,91 +591,6 @@ def narrow_series(request): return _narrow_series[request.param].copy() -# Used in tests in (series|frame)/methods/test_rank.py -_nuisance_arr_for_rank_by_dtype = { - "float64": [ - -np.inf, - -50, - -1, - -1e-20, - -1e-25, - -1e-50, - 0, - 1e-40, - 1e-20, - 1e-10, - 2, - 40, - np.inf, - ], - "float32": [ - -np.inf, - -50, - -1, - -1e-20, - -1e-25, - -1e-45, - 0, - 1e-40, - 1e-20, - 1e-10, - 2, - 40, - np.inf, - ], - "uint8": [np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], - "int64": [ - np.iinfo(np.int64).min, - -100, - 0, - 1, - 9999, - 100000, - 1e10, - np.iinfo(np.int64).max, - ], - "object": [NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], -} - - -@pytest.fixture(params=_nuisance_arr_for_rank_by_dtype.keys()) -def nuisance_rank_series_and_expected(request): - """ - Fixture for Series with troublesome values for rank - algorithms - """ - _dtype_na_map = { - "float64": np.nan, - "float32": np.nan, - "int64": iNaT, - "object": None, - } - dtype = request.param - if dtype == "int64": - mark = pytest.mark.xfail( - reason="iNaT is equivalent to minimum value of dtype" - "int64 pending issue GH#16674" - ) - request.node.add_marker(mark) - data = _nuisance_arr_for_rank_by_dtype[dtype] - values = np.array(data, dtype=dtype) - exp_order = np.array(range(len(values)), dtype="float64") + 1.0 - # Insert nans at random positions if underlying dtype has missing - # value. Then adjust the expected order by adding nans accordingly - # This is for testing whether rank calculation is affected - # when values are interwined with nan values. - if dtype in _dtype_na_map: - na_value = _dtype_na_map[dtype] - nan_indices = np.random.choice(range(len(values)), 5) - values = np.insert(values, nan_indices, na_value) - exp_order = np.insert(exp_order, nan_indices, np.nan) - # shuffle the testing array and expected results in the same way - random_order = np.random.permutation(len(values)) - iseries = Series(values[random_order]) - exp = Series(exp_order[random_order], dtype="float64") - return iseries, exp - - _index_or_series_objs = {**indices_dict, **_series, **_narrow_series} diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index b5da822ee6746..6ad1b475e28a2 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._libs import iNaT +from pandas._libs.algos import Infinity, NegInfinity import pandas.util._test_decorators as td from pandas import DataFrame, Series @@ -330,12 +332,91 @@ def test_pct_max_many_rows(self): result = df.rank(pct=True).max() assert (result == 1).all() - def test_rank_inf_and_nan(self, nuisance_rank_series_and_expected): - series, expected = nuisance_rank_series_and_expected - df = DataFrame({"a": series}) - expected = DataFrame({"a": expected}) - result = df.rank() - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "contents,dtype", + [ + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-50, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float64", + ), + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-45, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float32", + ), + ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"), + pytest.param( + [ + np.iinfo(np.int64).min, + -100, + 0, + 1, + 9999, + 100000, + 1e10, + np.iinfo(np.int64).max, + ], + "int64", + marks=pytest.mark.xfail( + reason="iNaT is equivalent to minimum value of dtype" + "int64 pending issue GH#16674" + ), + ), + ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"), + ], + ) + def test_rank_inf_and_nan(self, contents, dtype, frame_or_series): + dtype_na_map = { + "float64": np.nan, + "float32": np.nan, + "int64": iNaT, + "object": None, + } + # Insert nans at random positions if underlying dtype has missing + # value. Then adjust the expected order by adding nans accordingly + # This is for testing whether rank calculation is affected + # when values are interwined with nan values. + values = np.array(contents, dtype=dtype) + exp_order = np.array(range(len(values)), dtype="float64") + 1.0 + if dtype in dtype_na_map: + na_value = dtype_na_map[dtype] + nan_indices = np.random.choice(range(len(values)), 5) + values = np.insert(values, nan_indices, na_value) + exp_order = np.insert(exp_order, nan_indices, np.nan) + + # Shuffle the testing array and expected results in the same way + random_order = np.random.permutation(len(values)) + obj = frame_or_series(values[random_order]) + expected = frame_or_series(exp_order[random_order], dtype="float64") + result = obj.rank() + tm.assert_equal(result, expected) def test_df_series_inf_nan_consistency(self): # GH#32593 diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index d07cf16102d49..9d052e2236aae 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -205,11 +205,6 @@ def test_rank_signature(self): with pytest.raises(ValueError, match=msg): s.rank("average") - def test_rank_inf(self, nuisance_rank_series_and_expected): - series, expected = nuisance_rank_series_and_expected - result = series.rank() - tm.assert_series_equal(result, expected) - def test_rank_tie_methods(self): s = self.s