
Commit a09b1ef

REF: Create test_encoding file for CSV
1 parent 2a3d840 commit a09b1ef


2 files changed: +158 -144 lines changed


pandas/tests/io/parser/test_common.py

Lines changed: 0 additions & 144 deletions
@@ -69,17 +69,6 @@ def _set_noconvert_columns(self):
     tm.assert_frame_equal(result, expected)


-def test_bytes_io_input(all_parsers):
-    encoding = "cp1255"
-    parser = all_parsers
-
-    data = BytesIO("שלום:1234\n562:123".encode(encoding))
-    result = parser.read_csv(data, sep=":", encoding=encoding)
-
-    expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
-    tm.assert_frame_equal(result, expected)
-
-
 def test_empty_decimal_marker(all_parsers):
     data = """A|B|C
 1|2,334|5
@@ -316,15 +305,6 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path):
     tm.assert_frame_equal(result, expected)


-def test_read_csv_unicode(all_parsers):
-    parser = all_parsers
-    data = BytesIO("\u0141aski, Jan;1".encode("utf-8"))
-
-    result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
-    expected = DataFrame([["\u0141aski, Jan", 1]])
-    tm.assert_frame_equal(result, expected)
-
-
 def test_read_csv_wrong_num_columns(all_parsers):
     # Too few columns.
     data = """A,B,C,D,E,F
@@ -1064,59 +1044,6 @@ def test_skip_initial_space(all_parsers):
     tm.assert_frame_equal(result, expected)


-@pytest.mark.parametrize("sep", [",", "\t"])
-@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
-def test_utf16_bom_skiprows(all_parsers, sep, encoding):
-    # see gh-2298
-    parser = all_parsers
-    data = """skip this
-skip this too
-A,B,C
-1,2,3
-4,5,6""".replace(
-        ",", sep
-    )
-    path = "__{}__.csv".format(tm.rands(10))
-    kwargs = dict(sep=sep, skiprows=2)
-    utf8 = "utf-8"
-
-    with tm.ensure_clean(path) as path:
-        from io import TextIOWrapper
-
-        bytes_data = data.encode(encoding)
-
-        with open(path, "wb") as f:
-            f.write(bytes_data)
-
-        bytes_buffer = BytesIO(data.encode(utf8))
-        bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8)
-
-        result = parser.read_csv(path, encoding=encoding, **kwargs)
-        expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
-
-        bytes_buffer.close()
-    tm.assert_frame_equal(result, expected)
-
-
-def test_utf16_example(all_parsers, csv_dir_path):
-    path = os.path.join(csv_dir_path, "utf16_ex.txt")
-    parser = all_parsers
-    result = parser.read_csv(path, encoding="utf-16", sep="\t")
-    assert len(result) == 50
-
-
-def test_unicode_encoding(all_parsers, csv_dir_path):
-    path = os.path.join(csv_dir_path, "unicode_series.csv")
-    parser = all_parsers
-
-    result = parser.read_csv(path, header=None, encoding="latin-1")
-    result = result.set_index(0)
-    got = result[1][1632]
-
-    expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
-    assert got == expected
-
-
 def test_trailing_delimiters(all_parsers):
     # see gh-2442
     data = """A,B,C
@@ -1915,39 +1842,6 @@ def test_null_byte_char(all_parsers):
             parser.read_csv(StringIO(data), names=names)


-@pytest.mark.parametrize(
-    "data,kwargs,expected",
-    [
-        # Basic test
-        ("a\n1", dict(), DataFrame({"a": [1]})),
-        # "Regular" quoting
-        ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})),
-        # Test in a data row instead of header
-        ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})),
-        # Test in empty data row with skipping
-        ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})),
-        # Test in empty data row without skipping
-        (
-            "\n1",
-            dict(names=["a"], skip_blank_lines=False),
-            DataFrame({"a": [np.nan, 1]}),
-        ),
-    ],
-)
-def test_utf8_bom(all_parsers, data, kwargs, expected):
-    # see gh-4793
-    parser = all_parsers
-    bom = "\ufeff"
-    utf8 = "utf-8"
-
-    def _encode_data_with_bom(_data):
-        bom_data = (bom + _data).encode(utf8)
-        return BytesIO(bom_data)
-
-    result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
-    tm.assert_frame_equal(result, expected)
-
-
 def test_temporary_file(all_parsers):
     # see gh-13398
     parser = all_parsers
@@ -1965,20 +1859,6 @@ def test_temporary_file(all_parsers):
         tm.assert_frame_equal(result, expected)


-@pytest.mark.parametrize("byte", [8, 16])
-@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"])
-def test_read_csv_utf_aliases(all_parsers, byte, fmt):
-    # see gh-13549
-    expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
-    parser = all_parsers
-
-    encoding = fmt.format(byte)
-    data = "mb_num,multibyte\n4.8,test".encode(encoding)
-
-    result = parser.read_csv(BytesIO(data), encoding=encoding)
-    tm.assert_frame_equal(result, expected)
-
-
 def test_internal_eof_byte(all_parsers):
     # see gh-5500
     parser = all_parsers
@@ -2038,30 +1918,6 @@ def test_file_handles_with_open(all_parsers, csv1):
             assert not f.closed


-@pytest.mark.parametrize(
-    "fname,encoding",
-    [
-        ("test1.csv", "utf-8"),
-        ("unicode_series.csv", "latin-1"),
-        ("sauron.SHIFT_JIS.csv", "shiftjis"),
-    ],
-)
-def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
-    # gh-23779: Python csv engine shouldn't error on files opened in binary.
-    parser = all_parsers
-
-    fpath = os.path.join(csv_dir_path, fname)
-    expected = parser.read_csv(fpath, encoding=encoding)
-
-    with open(fpath, mode="r", encoding=encoding) as fa:
-        result = parser.read_csv(fa)
-    tm.assert_frame_equal(expected, result)
-
-    with open(fpath, mode="rb") as fb:
-        result = parser.read_csv(fb, encoding=encoding)
-    tm.assert_frame_equal(expected, result)
-
-
 def test_invalid_file_buffer_class(all_parsers):
     # see gh-15337
     class InvalidBuffer:
pandas/tests/io/parser/test_encoding.py

Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,158 @@
+"""
+Tests encoding functionality during parsing
+for all of the parsers defined in parsers.py
+"""
+
+from io import BytesIO
+
+from pandas import DataFrame
+
+import os
+import pytest
+
+import numpy as np
+import pandas._testing as tm
+
+
+def test_bytes_io_input(all_parsers):
+    encoding = "cp1255"
+    parser = all_parsers
+
+    data = BytesIO("שלום:1234\n562:123".encode(encoding))
+    result = parser.read_csv(data, sep=":", encoding=encoding)
+
+    expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_read_csv_unicode(all_parsers):
+    parser = all_parsers
+    data = BytesIO("\u0141aski, Jan;1".encode("utf-8"))
+
+    result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
+    expected = DataFrame([["\u0141aski, Jan", 1]])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("sep", [",", "\t"])
+@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
+def test_utf16_bom_skiprows(all_parsers, sep, encoding):
+    # see gh-2298
+    parser = all_parsers
+    data = """skip this
+skip this too
+A,B,C
+1,2,3
+4,5,6""".replace(
+        ",", sep
+    )
+    path = "__{}__.csv".format(tm.rands(10))
+    kwargs = dict(sep=sep, skiprows=2)
+    utf8 = "utf-8"
+
+    with tm.ensure_clean(path) as path:
+        from io import TextIOWrapper
+
+        bytes_data = data.encode(encoding)
+
+        with open(path, "wb") as f:
+            f.write(bytes_data)
+
+        bytes_buffer = BytesIO(data.encode(utf8))
+        bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8)
+
+        result = parser.read_csv(path, encoding=encoding, **kwargs)
+        expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
+
+        bytes_buffer.close()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_utf16_example(all_parsers, csv_dir_path):
+    path = os.path.join(csv_dir_path, "utf16_ex.txt")
+    parser = all_parsers
+    result = parser.read_csv(path, encoding="utf-16", sep="\t")
+    assert len(result) == 50
+
+
+def test_unicode_encoding(all_parsers, csv_dir_path):
+    path = os.path.join(csv_dir_path, "unicode_series.csv")
+    parser = all_parsers
+
+    result = parser.read_csv(path, header=None, encoding="latin-1")
+    result = result.set_index(0)
+    got = result[1][1632]
+
+    expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
+    assert got == expected
+
+
+@pytest.mark.parametrize(
+    "data,kwargs,expected",
+    [
+        # Basic test
+        ("a\n1", dict(), DataFrame({"a": [1]})),
+        # "Regular" quoting
+        ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})),
+        # Test in a data row instead of header
+        ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})),
+        # Test in empty data row with skipping
+        ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})),
+        # Test in empty data row without skipping
+        (
+            "\n1",
+            dict(names=["a"], skip_blank_lines=False),
+            DataFrame({"a": [np.nan, 1]}),
+        ),
+    ],
+)
+def test_utf8_bom(all_parsers, data, kwargs, expected):
+    # see gh-4793
+    parser = all_parsers
+    bom = "\ufeff"
+    utf8 = "utf-8"
+
+    def _encode_data_with_bom(_data):
+        bom_data = (bom + _data).encode(utf8)
+        return BytesIO(bom_data)
+
+    result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("byte", [8, 16])
+@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"])
+def test_read_csv_utf_aliases(all_parsers, byte, fmt):
+    # see gh-13549
+    expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
+    parser = all_parsers
+
+    encoding = fmt.format(byte)
+    data = "mb_num,multibyte\n4.8,test".encode(encoding)
+
+    result = parser.read_csv(BytesIO(data), encoding=encoding)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "fname,encoding",
+    [
+        ("test1.csv", "utf-8"),
+        ("unicode_series.csv", "latin-1"),
+        ("sauron.SHIFT_JIS.csv", "shiftjis"),
+    ],
+)
+def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
+    # gh-23779: Python csv engine shouldn't error on files opened in binary.
+    parser = all_parsers
+
+    fpath = os.path.join(csv_dir_path, fname)
+    expected = parser.read_csv(fpath, encoding=encoding)
+
+    with open(fpath, mode="r", encoding=encoding) as fa:
+        result = parser.read_csv(fa)
+    tm.assert_frame_equal(expected, result)
+
+    with open(fpath, mode="rb") as fb:
+        result = parser.read_csv(fb, encoding=encoding)
+    tm.assert_frame_equal(expected, result)
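
For readers outside the pandas test suite, the behaviour these relocated tests exercise can be reproduced with plain pandas.read_csv; the snippet below is a minimal standalone sketch mirroring test_bytes_io_input (it is not part of the commit) and assumes only an installed pandas.

# Minimal standalone sketch (not part of this commit): read a non-UTF-8
# bytes buffer with an explicit encoding, as test_bytes_io_input does
# via the all_parsers fixture.
from io import BytesIO

import pandas as pd

encoding = "cp1255"
buf = BytesIO("שלום:1234\n562:123".encode(encoding))

df = pd.read_csv(buf, sep=":", encoding=encoding)
print(df)  # expect one data row (562, 123) under the decoded cp1255 header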
