Commit 1c9ebd7

gfyoung authored and jreback committed
REF: Create test_encoding file for CSV (#30723)
1 parent 8cff0e1 · commit 1c9ebd7

File tree

2 files changed: 158 additions & 145 deletions
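For local verification, a minimal sketch (assuming a pandas development checkout with pytest installed) of running only the relocated encoding tests; the module path is assumed from the commit title and the location of test_common.py:

# Sketch only: run just the new encoding test module from the repository root.
import pytest

pytest.main(["pandas/tests/io/parser/test_encoding.py", "-v"])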


pandas/tests/io/parser/test_common.py

Lines changed: 1 addition & 145 deletions
@@ -5,7 +5,7 @@
 import codecs
 import csv
 from datetime import datetime
-from io import BytesIO, StringIO
+from io import StringIO
 import os
 import platform
 from tempfile import TemporaryFile
@@ -69,17 +69,6 @@ def _set_noconvert_columns(self):
     tm.assert_frame_equal(result, expected)
 
 
-def test_bytes_io_input(all_parsers):
-    encoding = "cp1255"
-    parser = all_parsers
-
-    data = BytesIO("שלום:1234\n562:123".encode(encoding))
-    result = parser.read_csv(data, sep=":", encoding=encoding)
-
-    expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
-    tm.assert_frame_equal(result, expected)
-
-
 def test_empty_decimal_marker(all_parsers):
     data = """A|B|C
 1|2,334|5
@@ -316,15 +305,6 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path):
     tm.assert_frame_equal(result, expected)
 
 
-def test_read_csv_unicode(all_parsers):
-    parser = all_parsers
-    data = BytesIO("\u0141aski, Jan;1".encode("utf-8"))
-
-    result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
-    expected = DataFrame([["\u0141aski, Jan", 1]])
-    tm.assert_frame_equal(result, expected)
-
-
 def test_read_csv_wrong_num_columns(all_parsers):
     # Too few columns.
     data = """A,B,C,D,E,F
@@ -1064,59 +1044,6 @@ def test_skip_initial_space(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("sep", [",", "\t"])
-@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
-def test_utf16_bom_skiprows(all_parsers, sep, encoding):
-    # see gh-2298
-    parser = all_parsers
-    data = """skip this
-skip this too
-A,B,C
-1,2,3
-4,5,6""".replace(
-        ",", sep
-    )
-    path = "__{}__.csv".format(tm.rands(10))
-    kwargs = dict(sep=sep, skiprows=2)
-    utf8 = "utf-8"
-
-    with tm.ensure_clean(path) as path:
-        from io import TextIOWrapper
-
-        bytes_data = data.encode(encoding)
-
-        with open(path, "wb") as f:
-            f.write(bytes_data)
-
-        bytes_buffer = BytesIO(data.encode(utf8))
-        bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8)
-
-        result = parser.read_csv(path, encoding=encoding, **kwargs)
-        expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
-
-        bytes_buffer.close()
-    tm.assert_frame_equal(result, expected)
-
-
-def test_utf16_example(all_parsers, csv_dir_path):
-    path = os.path.join(csv_dir_path, "utf16_ex.txt")
-    parser = all_parsers
-    result = parser.read_csv(path, encoding="utf-16", sep="\t")
-    assert len(result) == 50
-
-
-def test_unicode_encoding(all_parsers, csv_dir_path):
-    path = os.path.join(csv_dir_path, "unicode_series.csv")
-    parser = all_parsers
-
-    result = parser.read_csv(path, header=None, encoding="latin-1")
-    result = result.set_index(0)
-    got = result[1][1632]
-
-    expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
-    assert got == expected
-
-
 def test_trailing_delimiters(all_parsers):
     # see gh-2442
     data = """A,B,C
@@ -1915,39 +1842,6 @@ def test_null_byte_char(all_parsers):
             parser.read_csv(StringIO(data), names=names)
 
 
-@pytest.mark.parametrize(
-    "data,kwargs,expected",
-    [
-        # Basic test
-        ("a\n1", dict(), DataFrame({"a": [1]})),
-        # "Regular" quoting
-        ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})),
-        # Test in a data row instead of header
-        ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})),
-        # Test in empty data row with skipping
-        ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})),
-        # Test in empty data row without skipping
-        (
-            "\n1",
-            dict(names=["a"], skip_blank_lines=False),
-            DataFrame({"a": [np.nan, 1]}),
-        ),
-    ],
-)
-def test_utf8_bom(all_parsers, data, kwargs, expected):
-    # see gh-4793
-    parser = all_parsers
-    bom = "\ufeff"
-    utf8 = "utf-8"
-
-    def _encode_data_with_bom(_data):
-        bom_data = (bom + _data).encode(utf8)
-        return BytesIO(bom_data)
-
-    result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
-    tm.assert_frame_equal(result, expected)
-
-
 def test_temporary_file(all_parsers):
     # see gh-13398
     parser = all_parsers
@@ -1965,20 +1859,6 @@ def test_temporary_file(all_parsers):
         tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("byte", [8, 16])
-@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"])
-def test_read_csv_utf_aliases(all_parsers, byte, fmt):
-    # see gh-13549
-    expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
-    parser = all_parsers
-
-    encoding = fmt.format(byte)
-    data = "mb_num,multibyte\n4.8,test".encode(encoding)
-
-    result = parser.read_csv(BytesIO(data), encoding=encoding)
-    tm.assert_frame_equal(result, expected)
-
-
 def test_internal_eof_byte(all_parsers):
     # see gh-5500
     parser = all_parsers
@@ -2038,30 +1918,6 @@ def test_file_handles_with_open(all_parsers, csv1):
             assert not f.closed
 
 
-@pytest.mark.parametrize(
-    "fname,encoding",
-    [
-        ("test1.csv", "utf-8"),
-        ("unicode_series.csv", "latin-1"),
-        ("sauron.SHIFT_JIS.csv", "shiftjis"),
-    ],
-)
-def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
-    # gh-23779: Python csv engine shouldn't error on files opened in binary.
-    parser = all_parsers
-
-    fpath = os.path.join(csv_dir_path, fname)
-    expected = parser.read_csv(fpath, encoding=encoding)
-
-    with open(fpath, mode="r", encoding=encoding) as fa:
-        result = parser.read_csv(fa)
-    tm.assert_frame_equal(expected, result)
-
-    with open(fpath, mode="rb") as fb:
-        result = parser.read_csv(fb, encoding=encoding)
-    tm.assert_frame_equal(expected, result)
-
-
 def test_invalid_file_buffer_class(all_parsers):
     # see gh-15337
     class InvalidBuffer:
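
Every test above (and in the new file below) takes the all_parsers fixture, which runs the test once per read_csv engine ("c" and "python"); the real fixture lives in pandas/tests/io/parser/conftest.py. A simplified, hypothetical stand-in, only to illustrate the pattern the tests rely on:

# Hypothetical, simplified stand-in for the ``all_parsers`` fixture used by
# these tests; the real fixture in pandas/tests/io/parser/conftest.py is
# more elaborate.
import pandas as pd
import pytest


class _Parser:
    def __init__(self, engine):
        self.engine = engine

    def read_csv(self, *args, **kwargs):
        # Route every call through one specific parsing engine.
        kwargs.setdefault("engine", self.engine)
        return pd.read_csv(*args, **kwargs)


@pytest.fixture(params=["c", "python"])
def all_parsers(request):
    # Each test requesting ``all_parsers`` runs once per engine.
    return _Parser(request.param)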
pandas/tests/io/parser/test_encoding.py

Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
+"""
+Tests encoding functionality during parsing
+for all of the parsers defined in parsers.py
+"""
+
+from io import BytesIO
+import os
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame
+import pandas._testing as tm
+
+
+def test_bytes_io_input(all_parsers):
+    encoding = "cp1255"
+    parser = all_parsers
+
+    data = BytesIO("שלום:1234\n562:123".encode(encoding))
+    result = parser.read_csv(data, sep=":", encoding=encoding)
+
+    expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_read_csv_unicode(all_parsers):
+    parser = all_parsers
+    data = BytesIO("\u0141aski, Jan;1".encode("utf-8"))
+
+    result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
+    expected = DataFrame([["\u0141aski, Jan", 1]])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("sep", [",", "\t"])
+@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
+def test_utf16_bom_skiprows(all_parsers, sep, encoding):
+    # see gh-2298
+    parser = all_parsers
+    data = """skip this
+skip this too
+A,B,C
+1,2,3
+4,5,6""".replace(
+        ",", sep
+    )
+    path = "__{}__.csv".format(tm.rands(10))
+    kwargs = dict(sep=sep, skiprows=2)
+    utf8 = "utf-8"
+
+    with tm.ensure_clean(path) as path:
+        from io import TextIOWrapper
+
+        bytes_data = data.encode(encoding)
+
+        with open(path, "wb") as f:
+            f.write(bytes_data)
+
+        bytes_buffer = BytesIO(data.encode(utf8))
+        bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8)
+
+        result = parser.read_csv(path, encoding=encoding, **kwargs)
+        expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
+
+        bytes_buffer.close()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_utf16_example(all_parsers, csv_dir_path):
+    path = os.path.join(csv_dir_path, "utf16_ex.txt")
+    parser = all_parsers
+    result = parser.read_csv(path, encoding="utf-16", sep="\t")
+    assert len(result) == 50
+
+
+def test_unicode_encoding(all_parsers, csv_dir_path):
+    path = os.path.join(csv_dir_path, "unicode_series.csv")
+    parser = all_parsers
+
+    result = parser.read_csv(path, header=None, encoding="latin-1")
+    result = result.set_index(0)
+    got = result[1][1632]
+
+    expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
+    assert got == expected
+
+
+@pytest.mark.parametrize(
+    "data,kwargs,expected",
+    [
+        # Basic test
+        ("a\n1", dict(), DataFrame({"a": [1]})),
+        # "Regular" quoting
+        ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})),
+        # Test in a data row instead of header
+        ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})),
+        # Test in empty data row with skipping
+        ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})),
+        # Test in empty data row without skipping
+        (
+            "\n1",
+            dict(names=["a"], skip_blank_lines=False),
+            DataFrame({"a": [np.nan, 1]}),
+        ),
+    ],
+)
+def test_utf8_bom(all_parsers, data, kwargs, expected):
+    # see gh-4793
+    parser = all_parsers
+    bom = "\ufeff"
+    utf8 = "utf-8"
+
+    def _encode_data_with_bom(_data):
+        bom_data = (bom + _data).encode(utf8)
+        return BytesIO(bom_data)
+
+    result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("byte", [8, 16])
+@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"])
+def test_read_csv_utf_aliases(all_parsers, byte, fmt):
+    # see gh-13549
+    expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
+    parser = all_parsers
+
+    encoding = fmt.format(byte)
+    data = "mb_num,multibyte\n4.8,test".encode(encoding)
+
+    result = parser.read_csv(BytesIO(data), encoding=encoding)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "fname,encoding",
+    [
+        ("test1.csv", "utf-8"),
+        ("unicode_series.csv", "latin-1"),
+        ("sauron.SHIFT_JIS.csv", "shiftjis"),
+    ],
+)
+def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
+    # gh-23779: Python csv engine shouldn't error on files opened in binary.
+    parser = all_parsers
+
+    fpath = os.path.join(csv_dir_path, fname)
+    expected = parser.read_csv(fpath, encoding=encoding)
+
+    with open(fpath, mode="r", encoding=encoding) as fa:
+        result = parser.read_csv(fa)
+    tm.assert_frame_equal(expected, result)
+
+    with open(fpath, mode="rb") as fb:
+        result = parser.read_csv(fb, encoding=encoding)
+    tm.assert_frame_equal(expected, result)
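
The core behaviour the new module covers, shown standalone with plain pandas.read_csv rather than the all_parsers fixture (a sketch mirroring test_bytes_io_input above):

# Reading a non-UTF-8 byte stream works as long as the matching
# ``encoding`` is passed explicitly to read_csv.
from io import BytesIO

import pandas as pd

encoding = "cp1255"
raw = "שלום:1234\n562:123".encode(encoding)

df = pd.read_csv(BytesIO(raw), sep=":", encoding=encoding)
print(df)  # one row: 562, 123 under the two parsed header fields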
