diff --git a/pandas/tests/base/test_factorize.py b/pandas/tests/base/test_factorize.py
deleted file mode 100644
index f8cbadb987d29..0000000000000
--- a/pandas/tests/base/test_factorize.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import numpy as np
-import pytest
-
-import pandas as pd
-import pandas._testing as tm
-
-
-@pytest.mark.parametrize("sort", [True, False])
-def test_factorize(index_or_series_obj, sort):
-    obj = index_or_series_obj
-    result_codes, result_uniques = obj.factorize(sort=sort)
-
-    constructor = pd.Index
-    if isinstance(obj, pd.MultiIndex):
-        constructor = pd.MultiIndex.from_tuples
-    expected_uniques = constructor(obj.unique())
-
-    if sort:
-        expected_uniques = expected_uniques.sort_values()
-
-    # construct an integer ndarray so that
-    # `expected_uniques.take(expected_codes)` is equal to `obj`
-    expected_uniques_list = list(expected_uniques)
-    expected_codes = [expected_uniques_list.index(val) for val in obj]
-    expected_codes = np.asarray(expected_codes, dtype=np.intp)
-
-    tm.assert_numpy_array_equal(result_codes, expected_codes)
-    tm.assert_index_equal(result_uniques, expected_uniques)
-
-
-def test_series_factorize_na_sentinel_none():
-    # GH35667
-    values = np.array([1, 2, 1, np.nan])
-    ser = pd.Series(values)
-    codes, uniques = ser.factorize(na_sentinel=None)
-
-    expected_codes = np.array([0, 1, 0, 2], dtype=np.intp)
-    expected_uniques = pd.Index([1.0, 2.0, np.nan])
-
-    tm.assert_numpy_array_equal(codes, expected_codes)
-    tm.assert_index_equal(uniques, expected_uniques)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index ee8e2385fe698..caaca38d07fa5 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -23,11 +23,21 @@
 from pandas import (
     Categorical,
     CategoricalIndex,
+    DataFrame,
     DatetimeIndex,
     Index,
     IntervalIndex,
+    MultiIndex,
+    NaT,
+    Period,
+    PeriodIndex,
     Series,
+    Timedelta,
     Timestamp,
+    date_range,
+    timedelta_range,
+    to_datetime,
+    to_timedelta,
 )
 import pandas._testing as tm
 import pandas.core.algorithms as algos
@@ -36,6 +46,40 @@
 
 
 class TestFactorize:
+    @pytest.mark.parametrize("sort", [True, False])
+    def test_factorize(self, index_or_series_obj, sort):
+        obj = index_or_series_obj
+        result_codes, result_uniques = obj.factorize(sort=sort)
+
+        constructor = Index
+        if isinstance(obj, MultiIndex):
+            constructor = MultiIndex.from_tuples
+        expected_uniques = constructor(obj.unique())
+
+        if sort:
+            expected_uniques = expected_uniques.sort_values()
+
+        # construct an integer ndarray so that
+        # `expected_uniques.take(expected_codes)` is equal to `obj`
+        expected_uniques_list = list(expected_uniques)
+        expected_codes = [expected_uniques_list.index(val) for val in obj]
+        expected_codes = np.asarray(expected_codes, dtype=np.intp)
+
+        tm.assert_numpy_array_equal(result_codes, expected_codes)
+        tm.assert_index_equal(result_uniques, expected_uniques)
+
+    def test_series_factorize_na_sentinel_none(self):
+        # GH#35667
+        values = np.array([1, 2, 1, np.nan])
+        ser = Series(values)
+        codes, uniques = ser.factorize(na_sentinel=None)
+
+        expected_codes = np.array([0, 1, 0, 2], dtype=np.intp)
+        expected_uniques = Index([1.0, 2.0, np.nan])
+
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_index_equal(uniques, expected_uniques)
+
     def test_basic(self):
 
         codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"])
@@ -111,34 +155,34 @@ def test_datelike(self):
         tm.assert_index_equal(uniques, exp)
 
         # period
-        v1 = pd.Period("201302", freq="M")
-        v2 = pd.Period("201303", freq="M")
+        v1 = Period("201302", freq="M")
+        v2 = Period("201303", freq="M")
         x = Series([v1, v1, v1, v2, v2, v1])
 
         # periods are not 'sorted' as they are converted back into an index
         codes, uniques = algos.factorize(x)
         exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))
+        tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))
 
         codes, uniques = algos.factorize(x, sort=True)
         exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))
+        tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))
 
         # GH 5986
-        v1 = pd.to_timedelta("1 day 1 min")
-        v2 = pd.to_timedelta("1 day")
+        v1 = to_timedelta("1 day 1 min")
+        v2 = to_timedelta("1 day")
         x = Series([v1, v2, v1, v1, v2, v2, v1])
         codes, uniques = algos.factorize(x)
         exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        tm.assert_index_equal(uniques, pd.to_timedelta([v1, v2]))
+        tm.assert_index_equal(uniques, to_timedelta([v1, v2]))
 
         codes, uniques = algos.factorize(x, sort=True)
         exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        tm.assert_index_equal(uniques, pd.to_timedelta([v2, v1]))
+        tm.assert_index_equal(uniques, to_timedelta([v2, v1]))
 
     def test_factorize_nan(self):
         # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
@@ -241,7 +285,7 @@ def test_string_factorize(self, writable):
         tm.assert_numpy_array_equal(uniques, expected_uniques)
 
     def test_object_factorize(self, writable):
-        data = np.array(["a", "c", None, np.nan, "a", "b", pd.NaT, "c"], dtype=object)
+        data = np.array(["a", "c", None, np.nan, "a", "b", NaT, "c"], dtype=object)
         data.setflags(write=writable)
         expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
         expected_uniques = np.array(["a", "c", "b"], dtype=object)
@@ -404,7 +448,7 @@ def test_object_refcount_bug(self):
 
     def test_on_index_object(self):
 
-        mindex = pd.MultiIndex.from_arrays(
+        mindex = MultiIndex.from_arrays(
             [np.arange(5).repeat(5), np.tile(np.arange(5), 5)]
         )
         expected = mindex.values
@@ -456,7 +500,7 @@ def test_datetime64_dtype_array_returned(self):
             dtype="M8[ns]",
         )
 
-        dt_index = pd.to_datetime(
+        dt_index = to_datetime(
             [
                 "2015-01-03T00:00:00.000000000",
                 "2015-01-01T00:00:00.000000000",
@@ -493,7 +537,7 @@ def test_timedelta64_dtype_array_returned(self):
         # GH 9431
         expected = np.array([31200, 45678, 10000], dtype="m8[ns]")
 
-        td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
+        td_index = to_timedelta([31200, 45678, 31200, 10000, 45678])
         result = algos.unique(td_index)
         tm.assert_numpy_array_equal(result, expected)
         assert result.dtype == expected.dtype
@@ -772,7 +816,7 @@ def test_basic(self):
 
     def test_i8(self):
 
-        arr = pd.date_range("20130101", periods=3).values
+        arr = date_range("20130101", periods=3).values
         result = algos.isin(arr, [arr[0]])
         expected = np.array([True, False, False])
         tm.assert_numpy_array_equal(result, expected)
@@ -785,7 +829,7 @@ def test_i8(self):
         expected = np.array([True, True, False])
         tm.assert_numpy_array_equal(result, expected)
 
-        arr = pd.timedelta_range("1 day", periods=3).values
+        arr = timedelta_range("1 day", periods=3).values
         result = algos.isin(arr, [arr[0]])
         expected = np.array([True, False, False])
         tm.assert_numpy_array_equal(result, expected)
@@ -799,7 +843,7 @@ def test_i8(self):
         tm.assert_numpy_array_equal(result, expected)
 
     def test_large(self):
-        s = pd.date_range("20000101", periods=2000000, freq="s").values
+        s = date_range("20000101", periods=2000000, freq="s").values
         result = algos.isin(s, s[0:2])
         expected = np.zeros(len(s), dtype=bool)
         expected[0] = True
@@ -950,27 +994,27 @@ def test_different_nans_as_float64(self):
     def test_isin_int_df_string_search(self):
         """Comparing df with int`s (1,2) with a string at isin() ("1")
         -> should not match values because int 1 is not equal str 1"""
-        df = pd.DataFrame({"values": [1, 2]})
+        df = DataFrame({"values": [1, 2]})
         result = df.isin(["1"])
-        expected_false = pd.DataFrame({"values": [False, False]})
+        expected_false = DataFrame({"values": [False, False]})
         tm.assert_frame_equal(result, expected_false)
 
     @pytest.mark.xfail(reason="problem related with issue #34125")
     def test_isin_nan_df_string_search(self):
         """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN")
         -> should not match values because np.nan is not equal str NaN"""
-        df = pd.DataFrame({"values": [np.nan, 2]})
+        df = DataFrame({"values": [np.nan, 2]})
         result = df.isin(["NaN"])
-        expected_false = pd.DataFrame({"values": [False, False]})
+        expected_false = DataFrame({"values": [False, False]})
         tm.assert_frame_equal(result, expected_false)
 
     @pytest.mark.xfail(reason="problem related with issue #34125")
     def test_isin_float_df_string_search(self):
         """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245")
         -> should not match values because float 1.4245 is not equal str 1.4245"""
-        df = pd.DataFrame({"values": [1.4245, 2.32441]})
+        df = DataFrame({"values": [1.4245, 2.32441]})
         result = df.isin(["1.4245"])
-        expected_false = pd.DataFrame({"values": [False, False]})
+        expected_false = DataFrame({"values": [False, False]})
         tm.assert_frame_equal(result, expected_false)
 
 
@@ -1016,8 +1060,8 @@ def test_value_counts_dtypes(self):
             algos.value_counts(["1", 1], bins=1)
 
     def test_value_counts_nat(self):
-        td = Series([np.timedelta64(10000), pd.NaT], dtype="timedelta64[ns]")
-        dt = pd.to_datetime(["NaT", "2014-01-01"])
+        td = Series([np.timedelta64(10000), NaT], dtype="timedelta64[ns]")
+        dt = to_datetime(["NaT", "2014-01-01"])
 
         for s in [td, dt]:
             vc = algos.value_counts(s)
@@ -1051,7 +1095,7 @@ def test_value_counts_datetime_outofbounds(self):
         tm.assert_series_equal(res, exp)
 
         # GH 12424
-        res = pd.to_datetime(Series(["2362-01-01", np.nan]), errors="ignore")
+        res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore")
         exp = Series(["2362-01-01", np.nan], dtype=object)
         tm.assert_series_equal(res, exp)
 
@@ -1323,9 +1367,9 @@ def test_datetime_likes(self):
         cases = [
             np.array([Timestamp(d) for d in dt]),
             np.array([Timestamp(d, tz="US/Eastern") for d in dt]),
-            np.array([pd.Period(d, freq="D") for d in dt]),
+            np.array([Period(d, freq="D") for d in dt]),
             np.array([np.datetime64(d) for d in dt]),
-            np.array([pd.Timedelta(d) for d in td]),
+            np.array([Timedelta(d) for d in td]),
         ]
 
         exp_first = np.array(
@@ -1530,7 +1574,7 @@ def test_hashtable_unique(self, htable, tm_dtype, writable):
             s.loc[500] = np.nan
         elif htable == ht.PyObjectHashTable:
             # use different NaN types for object column
-            s.loc[500:502] = [np.nan, None, pd.NaT]
+            s.loc[500:502] = [np.nan, None, NaT]
 
         # create duplicated selection
         s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
@@ -1570,7 +1614,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable):
             s.loc[500] = np.nan
         elif htable == ht.PyObjectHashTable:
             # use different NaN types for object column
-            s.loc[500:502] = [np.nan, None, pd.NaT]
+            s.loc[500:502] = [np.nan, None, NaT]
 
         # create duplicated selection
         s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
@@ -2307,7 +2351,7 @@ def test_diff_datetimelike_nat(self, dtype):
         tm.assert_numpy_array_equal(result, expected.T)
 
     def test_diff_ea_axis(self):
-        dta = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")._data
+        dta = date_range("2016-01-01", periods=3, tz="US/Pacific")._data
 
         msg = "cannot diff DatetimeArray on axis=1"
         with pytest.raises(ValueError, match=msg):