diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index f0a855b575139..eec722c9f167b 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -268,7 +268,7 @@ def time_get_dummies(self, dtype): class Encode: def setup(self): - self.ser = Series(tm.makeUnicodeIndex()) + self.ser = Series(tm.makeStringIndex()) def time_encode_decode(self): self.ser.str.encode("utf-8").str.decode("utf-8") diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 96f37bd47e10c..603c2f081a31a 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -62,7 +62,6 @@ randbool, rands, rands_array, - randu_array, ) from pandas._testing._warnings import ( # noqa:F401 assert_produces_warning, @@ -305,10 +304,6 @@ def makeStringIndex(k=10, name=None): return Index(rands_array(nchars=10, size=k), name=name) -def makeUnicodeIndex(k=10, name=None): - return Index(randu_array(nchars=10, size=k), name=name) - - def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): """make a length k index or n categories""" x = rands_array(nchars=4, size=n, replace=False) @@ -521,10 +516,10 @@ def makeCustomIndex( label will repeated at the corresponding level, you can specify just the first few, the rest will use the default ndupe_l of 1. len(ndupe_l) <= nlevels. - idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td". + idx_type - "i"/"f"/"s"/"dt"/"p"/"td". If idx_type is not None, `idx_nlevels` must be 1. "i"/"f" creates an integer/float index, - "s"/"u" creates a string/unicode index + "s" creates a string index "dt" create a datetime index. "td" create a datetime index. 
@@ -554,7 +549,6 @@ def makeCustomIndex( "i": makeIntIndex, "f": makeFloatIndex, "s": makeStringIndex, - "u": makeUnicodeIndex, "dt": makeDateIndex, "td": makeTimedeltaIndex, "p": makePeriodIndex, @@ -569,7 +563,7 @@ def makeCustomIndex( elif idx_type is not None: raise ValueError( f"{repr(idx_type)} is not a legal value for `idx_type`, " - "use 'i'/'f'/'s'/'u'/'dt'/'p'/'td'." + "use 'i'/'f'/'s'/'dt'/'p'/'td'." ) if len(ndupe_l) < nlevels: @@ -651,10 +645,10 @@ def makeCustomDataframe( nrows/ncol, the last label might have lower multiplicity. dtype - passed to the DataFrame constructor as is, in case you wish to have more control in conjunction with a custom `data_gen_f` - r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt"/"td". + r_idx_type, c_idx_type - "i"/"f"/"s"/"dt"/"td". If idx_type is not None, `idx_nlevels` must be 1. "i"/"f" creates an integer/float index, - "s"/"u" creates a string/unicode index + "s" creates a string index "dt" create a datetime index. "td" create a timedelta index. @@ -689,10 +683,10 @@ def makeCustomDataframe( assert c_idx_nlevels > 0 assert r_idx_nlevels > 0 assert r_idx_type is None or ( - r_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and r_idx_nlevels == 1 + r_idx_type in ("i", "f", "s", "dt", "p", "td") and r_idx_nlevels == 1 ) assert c_idx_type is None or ( - c_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and c_idx_nlevels == 1 + c_idx_type in ("i", "f", "s", "dt", "p", "td") and c_idx_nlevels == 1 ) columns = makeCustomIndex( diff --git a/pandas/_testing/_random.py b/pandas/_testing/_random.py index f90e18a2020c5..cce6bf8da7d3e 100644 --- a/pandas/_testing/_random.py +++ b/pandas/_testing/_random.py @@ -26,18 +26,6 @@ def rands_array(nchars, size, dtype="O", replace=True): return retval.astype(dtype) -def randu_array(nchars, size, dtype="O"): - """ - Generate an array of unicode strings. 
- """ - retval = ( - np.random.choice(RANDU_CHARS, size=nchars * np.prod(size)) - .view((np.unicode_, nchars)) - .reshape(size) - ) - return retval.astype(dtype) - - def rands(nchars): """ Generate one random byte string. diff --git a/pandas/conftest.py b/pandas/conftest.py index dc03f081388b8..d330c2de9d23f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -543,7 +543,6 @@ def _create_mi_with_dt64tz_level(): indices_dict = { - "unicode": tm.makeUnicodeIndex(100), "string": tm.makeStringIndex(100), "datetime": tm.makeDateIndex(100), "datetime-tz": tm.makeDateIndex(100, tz="US/Pacific"), diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 910449d98bcc5..956e01ec5bde9 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -730,7 +730,7 @@ def should_warn(*args): class TestAlignment: - index_types = ["i", "u", "dt"] + index_types = ["i", "s", "dt"] lhs_index_types = index_types + ["s"] # 'p' def test_align_nested_unary_op(self, engine, parser): @@ -829,7 +829,7 @@ def test_basic_frame_series_alignment( @pytest.mark.parametrize("index_name", ["index", "columns"]) @pytest.mark.parametrize( "r_idx_type, c_idx_type", - list(product(["i", "u", "s"], ["i", "u", "s"])) + [("dt", "dt")], + list(product(["i", "s"], ["i", "s"])) + [("dt", "dt")], ) @pytest.mark.filterwarnings("ignore::RuntimeWarning") def test_basic_series_frame_alignment( diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index b7874d51b6f33..01009d6df3920 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -346,7 +346,7 @@ def test_to_csv_nrows(self, nrows): "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] ) @pytest.mark.parametrize( - "r_idx_type, c_idx_type", [("i", "i"), ("s", "s"), ("u", "dt"), ("p", "p")] + "r_idx_type, c_idx_type", [("i", "i"), ("s", "s"), ("s", "dt"), ("p", "p")] 
) @pytest.mark.parametrize("ncols", [1, 2, 3, 4]) def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols): diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 85602fdf7274a..5f1a81c504efe 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -150,24 +150,26 @@ def test_indices_grouped_by_tuple_with_lambda(self): class TestGrouping: - def test_grouper_index_types(self): - # related GH5375 - # groupby misbehaving when using a Floatlike index - df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB")) - for index in [ + @pytest.mark.parametrize( + "index", + [ tm.makeFloatIndex, tm.makeStringIndex, - tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, tm.makePeriodIndex, - ]: + ], + ) + def test_grouper_index_types(self, index): + # related GH5375 + # groupby misbehaving when using a Floatlike index + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB")) - df.index = index(len(df)) - df.groupby(list("abcde"), group_keys=False).apply(lambda x: x) + df.index = index(len(df)) + df.groupby(list("abcde"), group_keys=False).apply(lambda x: x) - df.index = list(reversed(df.index.tolist())) - df.groupby(list("abcde"), group_keys=False).apply(lambda x: x) + df.index = list(reversed(df.index.tolist())) + df.groupby(list("abcde"), group_keys=False).apply(lambda x: x) def test_grouper_multilevel_freq(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 03dc2d5f1a617..55f3e27be5a72 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -318,7 +318,6 @@ def test_view_with_args(self, index): @pytest.mark.parametrize( "index", [ - "unicode", "string", pytest.param("categorical", marks=pytest.mark.xfail(reason="gh-25464")), "bool-object", @@ -927,7 +926,7 @@ def test_slice_keep_name(self): @pytest.mark.parametrize( "index", - ["unicode", "string", "datetime", "int", "uint", "float"], + 
["string", "datetime", "int", "uint", "float"], indirect=True, ) def test_join_self(self, index, join_type): diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 0becdf5333ab7..186cba62c138f 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -45,7 +45,6 @@ def check(self, result, original, indexer, getitem): "index_func", [ tm.makeStringIndex, - tm.makeUnicodeIndex, tm.makeCategoricalIndex, tm.makeDateIndex, tm.makeTimedeltaIndex, @@ -83,7 +82,6 @@ def test_scalar_non_numeric(self, index_func, frame_or_series, indexer_sl): "index_func", [ tm.makeStringIndex, - tm.makeUnicodeIndex, tm.makeCategoricalIndex, tm.makeDateIndex, tm.makeTimedeltaIndex, @@ -220,7 +218,6 @@ def test_scalar_float(self, frame_or_series): "index_func", [ tm.makeStringIndex, - tm.makeUnicodeIndex, tm.makeDateIndex, tm.makeTimedeltaIndex, tm.makePeriodIndex, diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 3c807a0ad0567..04f4b0a313fdc 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -950,36 +950,35 @@ def test_to_string_with_column_specific_col_space(self): result = df.to_string(col_space=[10, 11, 12]) assert len(result.split("\n")[1]) == (3 + 1 + 10 + 11 + 12) - def test_to_string_truncate_indices(self): - for index in [ + @pytest.mark.parametrize( + "index", + [ tm.makeStringIndex, - tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, tm.makePeriodIndex, - ]: - for column in [tm.makeStringIndex]: - for h in [10, 20]: - for w in [10, 20]: - with option_context("display.expand_frame_repr", False): - df = DataFrame(index=index(h), columns=column(w)) - with option_context("display.max_rows", 15): - if h == 20: - assert has_vertically_truncated_repr(df) - else: - assert not has_vertically_truncated_repr(df) - with option_context("display.max_columns", 15): - if w == 20: - assert 
has_horizontally_truncated_repr(df) - else: - assert not (has_horizontally_truncated_repr(df)) - with option_context( - "display.max_rows", 15, "display.max_columns", 15 - ): - if h == 20 and w == 20: - assert has_doubly_truncated_repr(df) - else: - assert not has_doubly_truncated_repr(df) + ], + ) + @pytest.mark.parametrize("h", [10, 20]) + @pytest.mark.parametrize("w", [10, 20]) + def test_to_string_truncate_indices(self, index, h, w): + with option_context("display.expand_frame_repr", False): + df = DataFrame(index=index(h), columns=tm.makeStringIndex(w)) + with option_context("display.max_rows", 15): + if h == 20: + assert has_vertically_truncated_repr(df) + else: + assert not has_vertically_truncated_repr(df) + with option_context("display.max_columns", 15): + if w == 20: + assert has_horizontally_truncated_repr(df) + else: + assert not (has_horizontally_truncated_repr(df)) + with option_context("display.max_rows", 15, "display.max_columns", 15): + if h == 20 and w == 20: + assert has_doubly_truncated_repr(df) + else: + assert not has_doubly_truncated_repr(df) def test_to_string_truncate_multilevel(self): arrays = [ diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index 6c74a4bcc5e02..d4d95186110f8 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -227,8 +227,6 @@ def test_put_mixed_type(setup_path): ["fixed", tm.makeDateIndex], ["table", tm.makePeriodIndex], # GH#7796 ["fixed", tm.makePeriodIndex], - ["table", tm.makeUnicodeIndex], - ["fixed", tm.makeUnicodeIndex], ], ) def test_store_index_types(setup_path, format, index): diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 90f45f6e01469..8a933f4981ff3 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -993,7 +993,6 @@ def test_to_hdf_with_object_column_names(setup_path): types_should_run = [ tm.makeStringIndex, 
tm.makeCategoricalIndex, - tm.makeUnicodeIndex, ] for index in types_should_fail: diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 91ea5e09aa013..8ce1d717b6657 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -20,15 +20,17 @@ import pandas._testing as tm from pandas.util import _test_decorators as td -df1 = DataFrame( - { - "int": [1, 3], - "float": [2.0, np.nan], - "str": ["t", "s"], - "dt": date_range("2018-06-18", periods=2), - } -) -text = str(df1.to_csv(index=False)).encode() + +@pytest.fixture +def df1(): + return DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) @pytest.fixture @@ -40,7 +42,8 @@ def cleared_fs(): memfs.store.clear() -def test_read_csv(cleared_fs): +def test_read_csv(cleared_fs, df1): + text = str(df1.to_csv(index=False)).encode() with cleared_fs.open("test/test.csv", "wb") as w: w.write(text) df2 = read_csv("memory://test/test.csv", parse_dates=["dt"]) @@ -65,7 +68,7 @@ def test_reasonable_error(monkeypatch, cleared_fs): read_csv("couldexist://test/test.csv") -def test_to_csv(cleared_fs): +def test_to_csv(cleared_fs, df1): df1.to_csv("memory://test/test.csv", index=True) df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0) @@ -74,7 +77,7 @@ def test_to_csv(cleared_fs): @pytest.mark.parametrize("ext", ["xls", "xlsx"]) -def test_to_excel(cleared_fs, ext): +def test_to_excel(cleared_fs, ext, df1): if ext == "xls": pytest.importorskip("xlwt") else: @@ -89,7 +92,7 @@ def test_to_excel(cleared_fs, ext): @pytest.mark.parametrize("binary_mode", [False, True]) -def test_to_csv_fsspec_object(cleared_fs, binary_mode): +def test_to_csv_fsspec_object(cleared_fs, binary_mode, df1): fsspec = pytest.importorskip("fsspec") path = "memory://test/test.csv" @@ -153,7 +156,7 @@ def test_excel_options(fsspectest, extension): @td.skip_if_no("fastparquet") -def 
test_to_parquet_new_file(cleared_fs): +def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" df1.to_parquet( "memory://test/test.csv", index=True, engine="fastparquet", compression=None @@ -230,7 +233,7 @@ def test_s3_protocols(s3_resource, tips_file, protocol, s3so): @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet @td.skip_if_no("s3fs") @td.skip_if_no("fastparquet") -def test_s3_parquet(s3_resource, s3so): +def test_s3_parquet(s3_resource, s3so, df1): fn = "s3://pandas-test/test.parquet" df1.to_parquet( fn, index=False, engine="fastparquet", compression=None, storage_options=s3so diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 7677b8950c7a3..9d33e52709bd2 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -38,7 +38,6 @@ def get_objs(): tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern"), tm.makePeriodIndex(10, name="a"), tm.makeStringIndex(10, name="a"), - tm.makeUnicodeIndex(10, name="a"), ] arr = np.random.randn(10) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 9f2635beaf1e7..4498f11d77313 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -81,7 +81,7 @@ def f(df): "name, func", [ ("Int64Index", tm.makeIntIndex), - ("Index", tm.makeUnicodeIndex), + ("Index", tm.makeStringIndex), ("Float64Index", tm.makeFloatIndex), ("MultiIndex", lambda m: tm.makeCustomIndex(m, 2)), ], diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 80e34efb80789..9a0c3fd5e9fed 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -70,7 +70,6 @@ def test_tab_completion_with_categorical(self): @pytest.mark.parametrize( "index", [ - tm.makeUnicodeIndex(10), tm.makeStringIndex(10), 
tm.makeCategoricalIndex(10), Index(["foo", "bar", "baz"] * 2), diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 704028a7827cd..d15164bfeac64 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -369,15 +369,14 @@ def test_invalid_index_types(idx): @pytest.mark.skipif(is_platform_windows(), reason="see gh-10822: Windows issue") -@pytest.mark.parametrize("idx", [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)]) -def test_invalid_index_types_unicode(idx): +def test_invalid_index_types_unicode(): # see gh-10822 # # Odd error message on conversions to datetime for unicode. msg = "Unknown string format" with pytest.raises(ValueError, match=msg): - frequencies.infer_freq(idx) + frequencies.infer_freq(tm.makeStringIndex(10)) def test_string_datetime_like_compat():