# Reconstructed from a whitespace-collapsed unified diff against
# pandas/tests/io/parser/test_encoding.py. Only the test added by that diff is
# reproduced here, together with the imports it relies on.
from io import BytesIO

import numpy as np
import pytest

from pandas import DataFrame, read_csv

import pandas._testing as tm


@pytest.mark.parametrize(
    "encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"]
)
def test_parse_encoded_special_characters(encoding):
    """
    Check that ``read_csv`` round-trips a non-ASCII character in each encoding.

    Parameters
    ----------
    encoding : str
        Codec used both to encode the in-memory TSV payload and to decode it
        again through ``read_csv``.
    """
    # GH16218 Verify parsing of data with encoded special characters
    # Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a")
    # The character is written as an escape so that it cannot be silently
    # normalized to an ASCII ':' (which would make the test vacuous).
    data = "a\tb\n\uff1afoo\t0\nbar\t1\nbaz\t2"
    encoded_data = BytesIO(data.encode(encoding))
    result = read_csv(encoded_data, delimiter="\t", encoding=encoding)

    expected = DataFrame(
        data=[["\uff1afoo", 0], ["bar", 1], ["baz", 2]],
        columns=["a", "b"],
    )
    tm.assert_frame_equal(result, expected)