TST/CLN: misplaced factorize tests #37411

Merged
merged 1 commit on Oct 26, 2020
41 changes: 0 additions & 41 deletions pandas/tests/base/test_factorize.py

This file was deleted.
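The tests from the deleted module now live in pandas/tests/test_algos.py below. For orientation, here is a minimal sketch (not part of this PR) of the factorize behavior the relocated tests cover; it assumes a pandas version where Series.factorize accepts na_sentinel=None (GH#35667):

```python
import numpy as np
import pandas as pd

ser = pd.Series([1, 2, 1, np.nan])

# Default: NaN is encoded as the sentinel -1 and excluded from uniques.
codes, uniques = ser.factorize()
# codes   -> array([ 0,  1,  0, -1])
# uniques -> Index containing [1.0, 2.0]

# With na_sentinel=None, NaN receives its own code and appears in uniques.
codes, uniques = ser.factorize(na_sentinel=None)
# codes   -> array([0, 1, 0, 2])
# uniques -> Index containing [1.0, 2.0, nan]

# Round-trip invariant the parametrized test checks: taking uniques at
# codes reconstructs the original values.
restored = uniques.take(codes)
```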

102 changes: 73 additions & 29 deletions pandas/tests/test_algos.py
@@ -23,11 +23,21 @@
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
DatetimeIndex,
Index,
IntervalIndex,
MultiIndex,
NaT,
Period,
PeriodIndex,
Series,
Timedelta,
Timestamp,
date_range,
timedelta_range,
to_datetime,
to_timedelta,
)
import pandas._testing as tm
import pandas.core.algorithms as algos
@@ -36,6 +46,40 @@


class TestFactorize:
@pytest.mark.parametrize("sort", [True, False])
def test_factorize(self, index_or_series_obj, sort):
obj = index_or_series_obj
result_codes, result_uniques = obj.factorize(sort=sort)

constructor = Index
if isinstance(obj, MultiIndex):
constructor = MultiIndex.from_tuples
expected_uniques = constructor(obj.unique())

if sort:
expected_uniques = expected_uniques.sort_values()

# construct an integer ndarray so that
# `expected_uniques.take(expected_codes)` is equal to `obj`
expected_uniques_list = list(expected_uniques)
expected_codes = [expected_uniques_list.index(val) for val in obj]
expected_codes = np.asarray(expected_codes, dtype=np.intp)

tm.assert_numpy_array_equal(result_codes, expected_codes)
tm.assert_index_equal(result_uniques, expected_uniques)

def test_series_factorize_na_sentinel_none(self):
# GH#35667
values = np.array([1, 2, 1, np.nan])
ser = Series(values)
codes, uniques = ser.factorize(na_sentinel=None)

expected_codes = np.array([0, 1, 0, 2], dtype=np.intp)
expected_uniques = Index([1.0, 2.0, np.nan])

tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_index_equal(uniques, expected_uniques)

def test_basic(self):

codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"])
@@ -111,34 +155,34 @@ def test_datelike(self):
tm.assert_index_equal(uniques, exp)

# period
v1 = pd.Period("201302", freq="M")
v2 = pd.Period("201303", freq="M")
v1 = Period("201302", freq="M")
v2 = Period("201303", freq="M")
x = Series([v1, v1, v1, v2, v2, v1])

# periods are not 'sorted' as they are converted back into an index
codes, uniques = algos.factorize(x)
exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(codes, exp)
- tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))
+ tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))

codes, uniques = algos.factorize(x, sort=True)
exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(codes, exp)
- tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))
+ tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))

# GH 5986
- v1 = pd.to_timedelta("1 day 1 min")
- v2 = pd.to_timedelta("1 day")
+ v1 = to_timedelta("1 day 1 min")
+ v2 = to_timedelta("1 day")
x = Series([v1, v2, v1, v1, v2, v2, v1])
codes, uniques = algos.factorize(x)
exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(codes, exp)
- tm.assert_index_equal(uniques, pd.to_timedelta([v1, v2]))
+ tm.assert_index_equal(uniques, to_timedelta([v1, v2]))

codes, uniques = algos.factorize(x, sort=True)
exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
tm.assert_numpy_array_equal(codes, exp)
- tm.assert_index_equal(uniques, pd.to_timedelta([v2, v1]))
+ tm.assert_index_equal(uniques, to_timedelta([v2, v1]))

def test_factorize_nan(self):
# nan should map to na_sentinel, not reverse_indexer[na_sentinel]
@@ -241,7 +285,7 @@ def test_string_factorize(self, writable):
tm.assert_numpy_array_equal(uniques, expected_uniques)

def test_object_factorize(self, writable):
data = np.array(["a", "c", None, np.nan, "a", "b", pd.NaT, "c"], dtype=object)
data = np.array(["a", "c", None, np.nan, "a", "b", NaT, "c"], dtype=object)
data.setflags(write=writable)
expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
expected_uniques = np.array(["a", "c", "b"], dtype=object)
@@ -404,7 +448,7 @@ def test_object_refcount_bug(self):

def test_on_index_object(self):

- mindex = pd.MultiIndex.from_arrays(
+ mindex = MultiIndex.from_arrays(
[np.arange(5).repeat(5), np.tile(np.arange(5), 5)]
)
expected = mindex.values
@@ -456,7 +500,7 @@ def test_datetime64_dtype_array_returned(self):
dtype="M8[ns]",
)

- dt_index = pd.to_datetime(
+ dt_index = to_datetime(
[
"2015-01-03T00:00:00.000000000",
"2015-01-01T00:00:00.000000000",
@@ -493,7 +537,7 @@ def test_timedelta64_dtype_array_returned(self):
# GH 9431
expected = np.array([31200, 45678, 10000], dtype="m8[ns]")

- td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
+ td_index = to_timedelta([31200, 45678, 31200, 10000, 45678])
result = algos.unique(td_index)
tm.assert_numpy_array_equal(result, expected)
assert result.dtype == expected.dtype
@@ -772,7 +816,7 @@ def test_basic(self):

def test_i8(self):

arr = pd.date_range("20130101", periods=3).values
arr = date_range("20130101", periods=3).values
result = algos.isin(arr, [arr[0]])
expected = np.array([True, False, False])
tm.assert_numpy_array_equal(result, expected)
@@ -785,7 +829,7 @@ def test_i8(self):
expected = np.array([True, True, False])
tm.assert_numpy_array_equal(result, expected)

- arr = pd.timedelta_range("1 day", periods=3).values
+ arr = timedelta_range("1 day", periods=3).values
result = algos.isin(arr, [arr[0]])
expected = np.array([True, False, False])
tm.assert_numpy_array_equal(result, expected)
@@ -799,7 +843,7 @@ def test_i8(self):
tm.assert_numpy_array_equal(result, expected)

def test_large(self):
s = pd.date_range("20000101", periods=2000000, freq="s").values
s = date_range("20000101", periods=2000000, freq="s").values
result = algos.isin(s, s[0:2])
expected = np.zeros(len(s), dtype=bool)
expected[0] = True
@@ -950,27 +994,27 @@ def test_different_nans_as_float64(self):
def test_isin_int_df_string_search(self):
"""Comparing df with int`s (1,2) with a string at isin() ("1")
-> should not match values because int 1 is not equal str 1"""
df = pd.DataFrame({"values": [1, 2]})
df = DataFrame({"values": [1, 2]})
result = df.isin(["1"])
- expected_false = pd.DataFrame({"values": [False, False]})
+ expected_false = DataFrame({"values": [False, False]})
tm.assert_frame_equal(result, expected_false)

@pytest.mark.xfail(reason="problem related with issue #34125")
def test_isin_nan_df_string_search(self):
"""Comparing df with nan value (np.nan,2) with a string at isin() ("NaN")
-> should not match values because np.nan is not equal str NaN"""
df = pd.DataFrame({"values": [np.nan, 2]})
df = DataFrame({"values": [np.nan, 2]})
result = df.isin(["NaN"])
- expected_false = pd.DataFrame({"values": [False, False]})
+ expected_false = DataFrame({"values": [False, False]})
tm.assert_frame_equal(result, expected_false)

@pytest.mark.xfail(reason="problem related with issue #34125")
def test_isin_float_df_string_search(self):
"""Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245")
-> should not match values because float 1.4245 is not equal str 1.4245"""
df = pd.DataFrame({"values": [1.4245, 2.32441]})
df = DataFrame({"values": [1.4245, 2.32441]})
result = df.isin(["1.4245"])
- expected_false = pd.DataFrame({"values": [False, False]})
+ expected_false = DataFrame({"values": [False, False]})
tm.assert_frame_equal(result, expected_false)


@@ -1016,8 +1060,8 @@ def test_value_counts_dtypes(self):
algos.value_counts(["1", 1], bins=1)

def test_value_counts_nat(self):
- td = Series([np.timedelta64(10000), pd.NaT], dtype="timedelta64[ns]")
- dt = pd.to_datetime(["NaT", "2014-01-01"])
+ td = Series([np.timedelta64(10000), NaT], dtype="timedelta64[ns]")
+ dt = to_datetime(["NaT", "2014-01-01"])

for s in [td, dt]:
vc = algos.value_counts(s)
@@ -1051,7 +1095,7 @@ def test_value_counts_datetime_outofbounds(self):
tm.assert_series_equal(res, exp)

# GH 12424
res = pd.to_datetime(Series(["2362-01-01", np.nan]), errors="ignore")
res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore")
exp = Series(["2362-01-01", np.nan], dtype=object)
tm.assert_series_equal(res, exp)

@@ -1323,9 +1367,9 @@ def test_datetime_likes(self):
cases = [
np.array([Timestamp(d) for d in dt]),
np.array([Timestamp(d, tz="US/Eastern") for d in dt]),
np.array([pd.Period(d, freq="D") for d in dt]),
np.array([Period(d, freq="D") for d in dt]),
np.array([np.datetime64(d) for d in dt]),
- np.array([pd.Timedelta(d) for d in td]),
+ np.array([Timedelta(d) for d in td]),
]

exp_first = np.array(
@@ -1530,7 +1574,7 @@ def test_hashtable_unique(self, htable, tm_dtype, writable):
s.loc[500] = np.nan
elif htable == ht.PyObjectHashTable:
# use different NaN types for object column
- s.loc[500:502] = [np.nan, None, pd.NaT]
+ s.loc[500:502] = [np.nan, None, NaT]

# create duplicated selection
s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
@@ -1570,7 +1614,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable):
s.loc[500] = np.nan
elif htable == ht.PyObjectHashTable:
# use different NaN types for object column
- s.loc[500:502] = [np.nan, None, pd.NaT]
+ s.loc[500:502] = [np.nan, None, NaT]

# create duplicated selection
s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
@@ -2307,7 +2351,7 @@ def test_diff_datetimelike_nat(self, dtype):
tm.assert_numpy_array_equal(result, expected.T)

def test_diff_ea_axis(self):
dta = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")._data
dta = date_range("2016-01-01", periods=3, tz="US/Pacific")._data

msg = "cannot diff DatetimeArray on axis=1"
with pytest.raises(ValueError, match=msg):