From 6aac1dfedf4099790620537b62954da5438da43c Mon Sep 17 00:00:00 2001
From: Avinash Pancham
Date: Sun, 4 Oct 2020 00:13:38 +0200
Subject: [PATCH 1/2] TST: Verify parsing of data with encoded special characters

---
 pandas/tests/io/test_common.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index ede8d61490778..433aefb5c49a3 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -1,7 +1,7 @@
 """
 Tests for the pandas.io.common functionalities
 """
-from io import StringIO
+from io import BytesIO, StringIO
 import mmap
 import os
 from pathlib import Path
@@ -417,3 +417,19 @@ def test_is_fsspec_url():
     assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
     assert not icom.is_fsspec_url("/local/path")
     assert not icom.is_fsspec_url("relative/local/path")
+
+
+@pytest.mark.parametrize(
+    "encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"]
+)
+def test_parse_encoded_special_characters(encoding):
+    # GH16218 Verify parsing of data with encoded special characters
+    # Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a")
+    data = "a\tb\n：foo\t0\nbar\t1\nbaz\t2"
+    encoded_data = BytesIO(data.encode(encoding))
+    result = pd.read_csv(encoded_data, delimiter="\t", encoding=encoding)
+
+    expected = pd.DataFrame(
+        data=[["：foo", 0], ["bar", 1], ["baz", 2]], columns=["a", "b"]
+    )
+    tm.assert_frame_equal(result, expected)

From 5171472d1f289161a1999e236c37e68e1475c9ec Mon Sep 17 00:00:00 2001
From: Avinash Pancham
Date: Sun, 4 Oct 2020 13:13:35 +0200
Subject: [PATCH 2/2] Move

---
 pandas/tests/io/parser/test_encoding.py | 16 +++++++++++++++-
 pandas/tests/io/test_common.py          | 18 +-----------------
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
index f23b498c7388a..876696ecdad9c 100644
--- a/pandas/tests/io/parser/test_encoding.py
+++ b/pandas/tests/io/parser/test_encoding.py
@@ -10,7 +10,7 @@
 import numpy as np
 import pytest
 
-from pandas import DataFrame
+from pandas import DataFrame, read_csv
 
 import pandas._testing as tm
 
@@ -199,3 +199,17 @@ def test_encoding_named_temp_file(all_parsers):
 
         result = parser.read_csv(f, encoding=encoding)
         tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"]
+)
+def test_parse_encoded_special_characters(encoding):
+    # GH16218 Verify parsing of data with encoded special characters
+    # Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a")
+    data = "a\tb\n：foo\t0\nbar\t1\nbaz\t2"
+    encoded_data = BytesIO(data.encode(encoding))
+    result = read_csv(encoded_data, delimiter="\t", encoding=encoding)
+
+    expected = DataFrame(data=[["：foo", 0], ["bar", 1], ["baz", 2]], columns=["a", "b"])
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index 433aefb5c49a3..ede8d61490778 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -1,7 +1,7 @@
 """
 Tests for the pandas.io.common functionalities
 """
-from io import BytesIO, StringIO
+from io import StringIO
 import mmap
 import os
 from pathlib import Path
@@ -417,19 +417,3 @@ def test_is_fsspec_url():
     assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
     assert not icom.is_fsspec_url("/local/path")
     assert not icom.is_fsspec_url("relative/local/path")
-
-
-@pytest.mark.parametrize(
-    "encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"]
-)
-def test_parse_encoded_special_characters(encoding):
-    # GH16218 Verify parsing of data with encoded special characters
-    # Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a")
-    data = "a\tb\n：foo\t0\nbar\t1\nbaz\t2"
-    encoded_data = BytesIO(data.encode(encoding))
-    result = pd.read_csv(encoded_data, delimiter="\t", encoding=encoding)
-
-    expected = pd.DataFrame(
-        data=[["：foo", 0], ["bar", 1], ["baz", 2]], columns=["a", "b"]
-    )
-    tm.assert_frame_equal(result, expected)