From fe2ecfb308a820486587e318048217c2be01e308 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Sun, 5 Jan 2020 17:40:22 -0800
Subject: [PATCH] REF: Create test_encoding file for CSV

---
 pandas/tests/io/parser/test_common.py   | 146 +---------------------
 pandas/tests/io/parser/test_encoding.py | 157 ++++++++++++++++++++++++
 2 files changed, 158 insertions(+), 145 deletions(-)
 create mode 100644 pandas/tests/io/parser/test_encoding.py

diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index 0e408df625ccd..4c02a37b66455 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -5,7 +5,7 @@
 import codecs
 import csv
 from datetime import datetime
-from io import BytesIO, StringIO
+from io import StringIO
 import os
 import platform
 from tempfile import TemporaryFile
@@ -69,17 +69,6 @@ def _set_noconvert_columns(self):
     tm.assert_frame_equal(result, expected)
 
 
-def test_bytes_io_input(all_parsers):
-    encoding = "cp1255"
-    parser = all_parsers
-
-    data = BytesIO("שלום:1234\n562:123".encode(encoding))
-    result = parser.read_csv(data, sep=":", encoding=encoding)
-
-    expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
-    tm.assert_frame_equal(result, expected)
-
-
 def test_empty_decimal_marker(all_parsers):
     data = """A|B|C
 1|2,334|5
@@ -316,15 +305,6 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path):
     tm.assert_frame_equal(result, expected)
 
 
-def test_read_csv_unicode(all_parsers):
-    parser = all_parsers
-    data = BytesIO("\u0141aski, Jan;1".encode("utf-8"))
-
-    result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
-    expected = DataFrame([["\u0141aski, Jan", 1]])
-    tm.assert_frame_equal(result, expected)
-
-
 def test_read_csv_wrong_num_columns(all_parsers):
     # Too few columns.
     data = """A,B,C,D,E,F
@@ -1064,59 +1044,6 @@ def test_skip_initial_space(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("sep", [",", "\t"])
-@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
-def test_utf16_bom_skiprows(all_parsers, sep, encoding):
-    # see gh-2298
-    parser = all_parsers
-    data = """skip this
-skip this too
-A,B,C
-1,2,3
-4,5,6""".replace(
-        ",", sep
-    )
-    path = "__{}__.csv".format(tm.rands(10))
-    kwargs = dict(sep=sep, skiprows=2)
-    utf8 = "utf-8"
-
-    with tm.ensure_clean(path) as path:
-        from io import TextIOWrapper
-
-        bytes_data = data.encode(encoding)
-
-        with open(path, "wb") as f:
-            f.write(bytes_data)
-
-        bytes_buffer = BytesIO(data.encode(utf8))
-        bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8)
-
-        result = parser.read_csv(path, encoding=encoding, **kwargs)
-        expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
-
-        bytes_buffer.close()
-    tm.assert_frame_equal(result, expected)
-
-
-def test_utf16_example(all_parsers, csv_dir_path):
-    path = os.path.join(csv_dir_path, "utf16_ex.txt")
-    parser = all_parsers
-    result = parser.read_csv(path, encoding="utf-16", sep="\t")
-    assert len(result) == 50
-
-
-def test_unicode_encoding(all_parsers, csv_dir_path):
-    path = os.path.join(csv_dir_path, "unicode_series.csv")
-    parser = all_parsers
-
-    result = parser.read_csv(path, header=None, encoding="latin-1")
-    result = result.set_index(0)
-    got = result[1][1632]
-
-    expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
-    assert got == expected
-
-
 def test_trailing_delimiters(all_parsers):
     # see gh-2442
     data = """A,B,C
@@ -1915,39 +1842,6 @@ def test_null_byte_char(all_parsers):
             parser.read_csv(StringIO(data), names=names)
 
 
-@pytest.mark.parametrize(
-    "data,kwargs,expected",
-    [
-        # Basic test
-        ("a\n1", dict(), DataFrame({"a": [1]})),
-        # "Regular" quoting
-        ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})),
-        # Test in a data row instead of header
-        ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})),
-        # Test in empty data row with skipping
-        ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})),
-        # Test in empty data row without skipping
-        (
-            "\n1",
-            dict(names=["a"], skip_blank_lines=False),
-            DataFrame({"a": [np.nan, 1]}),
-        ),
-    ],
-)
-def test_utf8_bom(all_parsers, data, kwargs, expected):
-    # see gh-4793
-    parser = all_parsers
-    bom = "\ufeff"
-    utf8 = "utf-8"
-
-    def _encode_data_with_bom(_data):
-        bom_data = (bom + _data).encode(utf8)
-        return BytesIO(bom_data)
-
-    result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
-    tm.assert_frame_equal(result, expected)
-
-
 def test_temporary_file(all_parsers):
     # see gh-13398
     parser = all_parsers
@@ -1965,20 +1859,6 @@ def test_temporary_file(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("byte", [8, 16])
-@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"])
-def test_read_csv_utf_aliases(all_parsers, byte, fmt):
-    # see gh-13549
-    expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
-    parser = all_parsers
-
-    encoding = fmt.format(byte)
-    data = "mb_num,multibyte\n4.8,test".encode(encoding)
-
-    result = parser.read_csv(BytesIO(data), encoding=encoding)
-    tm.assert_frame_equal(result, expected)
-
-
 def test_internal_eof_byte(all_parsers):
     # see gh-5500
     parser = all_parsers
@@ -2038,30 +1918,6 @@ def test_file_handles_with_open(all_parsers, csv1):
         assert not f.closed
 
 
-@pytest.mark.parametrize(
-    "fname,encoding",
-    [
-        ("test1.csv", "utf-8"),
-        ("unicode_series.csv", "latin-1"),
-        ("sauron.SHIFT_JIS.csv", "shiftjis"),
-    ],
-)
-def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
-    # gh-23779: Python csv engine shouldn't error on files opened in binary.
-    parser = all_parsers
-
-    fpath = os.path.join(csv_dir_path, fname)
-    expected = parser.read_csv(fpath, encoding=encoding)
-
-    with open(fpath, mode="r", encoding=encoding) as fa:
-        result = parser.read_csv(fa)
-        tm.assert_frame_equal(expected, result)
-
-    with open(fpath, mode="rb") as fb:
-        result = parser.read_csv(fb, encoding=encoding)
-        tm.assert_frame_equal(expected, result)
-
-
 def test_invalid_file_buffer_class(all_parsers):
     # see gh-15337
     class InvalidBuffer:
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
new file mode 100644
index 0000000000000..2540dd9b19fce
--- /dev/null
+++ b/pandas/tests/io/parser/test_encoding.py
@@ -0,0 +1,157 @@
+"""
+Tests encoding functionality during parsing
+for all of the parsers defined in parsers.py
+"""
+
+from io import BytesIO
+import os
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame
+import pandas._testing as tm
+
+
+def test_bytes_io_input(all_parsers):
+    encoding = "cp1255"
+    parser = all_parsers
+
+    data = BytesIO("שלום:1234\n562:123".encode(encoding))
+    result = parser.read_csv(data, sep=":", encoding=encoding)
+
+    expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_read_csv_unicode(all_parsers):
+    parser = all_parsers
+    data = BytesIO("\u0141aski, Jan;1".encode("utf-8"))
+
+    result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
+    expected = DataFrame([["\u0141aski, Jan", 1]])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("sep", [",", "\t"])
+@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
+def test_utf16_bom_skiprows(all_parsers, sep, encoding):
+    # see gh-2298
+    parser = all_parsers
+    data = """skip this
+skip this too
+A,B,C
+1,2,3
+4,5,6""".replace(
+        ",", sep
+    )
+    path = "__{}__.csv".format(tm.rands(10))
+    kwargs = dict(sep=sep, skiprows=2)
+    utf8 = "utf-8"
+
+    with tm.ensure_clean(path) as path:
+        from io import TextIOWrapper
+
+        bytes_data = data.encode(encoding)
+
+        with open(path, "wb") as f:
+            f.write(bytes_data)
+
+        bytes_buffer = BytesIO(data.encode(utf8))
+        bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8)
+
+        result = parser.read_csv(path, encoding=encoding, **kwargs)
+        expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
+
+        bytes_buffer.close()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_utf16_example(all_parsers, csv_dir_path):
+    path = os.path.join(csv_dir_path, "utf16_ex.txt")
+    parser = all_parsers
+    result = parser.read_csv(path, encoding="utf-16", sep="\t")
+    assert len(result) == 50
+
+
+def test_unicode_encoding(all_parsers, csv_dir_path):
+    path = os.path.join(csv_dir_path, "unicode_series.csv")
+    parser = all_parsers
+
+    result = parser.read_csv(path, header=None, encoding="latin-1")
+    result = result.set_index(0)
+    got = result[1][1632]
+
+    expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
+    assert got == expected
+
+
+@pytest.mark.parametrize(
+    "data,kwargs,expected",
+    [
+        # Basic test
+        ("a\n1", dict(), DataFrame({"a": [1]})),
+        # "Regular" quoting
+        ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})),
+        # Test in a data row instead of header
+        ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})),
+        # Test in empty data row with skipping
+        ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})),
+        # Test in empty data row without skipping
+        (
+            "\n1",
+            dict(names=["a"], skip_blank_lines=False),
+            DataFrame({"a": [np.nan, 1]}),
+        ),
+    ],
+)
+def test_utf8_bom(all_parsers, data, kwargs, expected):
+    # see gh-4793
+    parser = all_parsers
+    bom = "\ufeff"
+    utf8 = "utf-8"
+
+    def _encode_data_with_bom(_data):
+        bom_data = (bom + _data).encode(utf8)
+        return BytesIO(bom_data)
+
+    result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("byte", [8, 16])
+@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"])
+def test_read_csv_utf_aliases(all_parsers, byte, fmt):
+    # see gh-13549
+    expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
+    parser = all_parsers
+
+    encoding = fmt.format(byte)
+    data = "mb_num,multibyte\n4.8,test".encode(encoding)
+
+    result = parser.read_csv(BytesIO(data), encoding=encoding)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "fname,encoding",
+    [
+        ("test1.csv", "utf-8"),
+        ("unicode_series.csv", "latin-1"),
+        ("sauron.SHIFT_JIS.csv", "shiftjis"),
+    ],
+)
+def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
+    # gh-23779: Python csv engine shouldn't error on files opened in binary.
+    parser = all_parsers
+
+    fpath = os.path.join(csv_dir_path, fname)
+    expected = parser.read_csv(fpath, encoding=encoding)
+
+    with open(fpath, mode="r", encoding=encoding) as fa:
+        result = parser.read_csv(fa)
+        tm.assert_frame_equal(expected, result)
+
+    with open(fpath, mode="rb") as fb:
+        result = parser.read_csv(fb, encoding=encoding)
+        tm.assert_frame_equal(expected, result)
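
Note (not part of the patch): for anyone verifying the refactor locally, the relocated tests can be run on their own. The lines below are a minimal sketch, assuming a pandas development checkout installed in editable mode and executed from the repository root; the path and flags are illustrative only.

    # Run only the newly created encoding test module for the CSV parsers.
    # Assumes a development install of pandas so that the all_parsers and
    # csv_dir_path fixtures under pandas/tests/io/parser resolve correctly.
    import pytest

    exit_code = pytest.main(["pandas/tests/io/parser/test_encoding.py", "-v"])
    print("pytest exit code:", exit_code)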