Commit eaa7789: restore

1 parent f6e87cd commit eaa7789


42 files changed: 12972 additions, 0 deletions

pandas/tests/io/parser/__init__.py

Whitespace-only changes.

pandas/tests/io/parser/common/__init__.py

Whitespace-only changes.
Lines changed: 278 additions & 0 deletions
@@ -0,0 +1,278 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import numpy as np
import pytest

from pandas.errors import DtypeWarning

from pandas import (
    DataFrame,
    concat,
)
import pandas._testing as tm

pytestmark = pytest.mark.usefixtures("pyarrow_skip")


@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    expected = DataFrame(
        [
            ["foo", 2, 3, 4, 5],
            ["bar", 7, 8, 9, 10],
            ["baz", 12, 13, 14, 15],
            ["qux", 12, 13, 14, 15],
            ["foo2", 12, 13, 14, 15],
            ["bar2", 12, 13, 14, 15],
        ],
        columns=["index", "A", "B", "C", "D"],
    )
    expected = expected.set_index("index")

    with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
        chunks = list(reader)
    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])


@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    msg = r"'chunksize' must be an integer >=1"

    with pytest.raises(ValueError, match=msg):
        with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
            pass


@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
    # see gh-15755
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0, "nrows": 5}

    expected = parser.read_csv(StringIO(data), **kwargs)
    with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
        tm.assert_frame_equal(concat(reader), expected)


def test_read_chunksize_and_nrows_changing_size(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0, "nrows": 5}

    expected = parser.read_csv(StringIO(data), **kwargs)
    with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
        tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
        tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])

        with pytest.raises(StopIteration, match=""):
            reader.get_chunk(size=3)


def test_get_chunk_passed_chunksize(all_parsers):
    parser = all_parsers
    data = """A,B,C
1,2,3
4,5,6
7,8,9
1,2,3"""

    with parser.read_csv(StringIO(data), chunksize=2) as reader:
        result = reader.get_chunk()

    expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
def test_read_chunksize_compat(all_parsers, kwargs):
    # see gh-12185
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
        tm.assert_frame_equal(concat(reader), result)


def test_read_chunksize_jagged_names(all_parsers):
    # see gh-23509
    parser = all_parsers
    data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])

    expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
    with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
        result = concat(reader)
    tm.assert_frame_equal(result, expected)


def test_chunk_begins_with_newline_whitespace(all_parsers):
    # see gh-10022
    parser = all_parsers
    data = "\n hello\nworld\n"

    result = parser.read_csv(StringIO(data), header=None)
    expected = DataFrame([" hello", "world"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.slow
def test_chunks_have_consistent_numerical_type(all_parsers):
    parser = all_parsers
    integers = [str(i) for i in range(499999)]
    data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)

    # Coercions should work without warnings.
    with tm.assert_produces_warning(None):
        result = parser.read_csv(StringIO(data))

    assert type(result.a[0]) is np.float64
    assert result.a.dtype == float


def test_warn_if_chunks_have_mismatched_type(all_parsers):
    warning_type = None
    parser = all_parsers
    size = 10000

    # see gh-3866: if chunks are different types and can't
    # be coerced using numerical types, then issue warning.
    if parser.engine == "c" and parser.low_memory:
        warning_type = DtypeWarning
        # Use larger size to hit warning path
        size = 499999

    integers = [str(i) for i in range(size)]
    data = "a\n" + "\n".join(integers + ["a", "b"] + integers)

    buf = StringIO(data)

    with tm.assert_produces_warning(warning_type):
        df = parser.read_csv(buf)

    assert df.a.dtype == object


@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
    # see gh-9535
    parser = all_parsers
    expected = DataFrame(columns=["foo", "bar"])

    nrows = 10
    data = StringIO("foo,bar\n")

    if iterator:
        with parser.read_csv(data, chunksize=nrows) as reader:
            result = next(iter(reader))
    else:
        result = parser.read_csv(data, nrows=nrows)

    tm.assert_frame_equal(result, expected)


def test_read_csv_memory_growth_chunksize(all_parsers):
    # see gh-24805
    #
    # Let's just make sure that we don't crash
    # as we iteratively process all chunks.
    parser = all_parsers

    with tm.ensure_clean() as path:
        with open(path, "w") as f:
            for i in range(1000):
                f.write(str(i) + "\n")

        with parser.read_csv(path, chunksize=20) as result:
            for _ in result:
                pass


def test_chunksize_with_usecols_second_block_shorter(all_parsers):
    # GH#21211
    parser = all_parsers
    data = """1,2,3,4
5,6,7,8
9,10,11
"""

    result_chunks = parser.read_csv(
        StringIO(data),
        names=["a", "b"],
        chunksize=2,
        usecols=[0, 1],
        header=None,
    )

    expected_frames = [
        DataFrame({"a": [1, 5], "b": [2, 6]}),
        DataFrame({"a": [9], "b": [10]}, index=[2]),
    ]

    for i, result in enumerate(result_chunks):
        tm.assert_frame_equal(result, expected_frames[i])


def test_chunksize_second_block_shorter(all_parsers):
    # GH#21211
    parser = all_parsers
    data = """a,b,c,d
1,2,3,4
5,6,7,8
9,10,11
"""

    result_chunks = parser.read_csv(StringIO(data), chunksize=2)

    expected_frames = [
        DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
        DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]),
    ]

    for i, result in enumerate(result_chunks):
        tm.assert_frame_equal(result, expected_frames[i])
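
All of the tests above exercise the same chunked-reading pattern: passing chunksize to read_csv returns a TextFileReader that can be used as a context manager, queried with get_chunk(), and iterated (or concatenated) to recover the remaining rows. For reference, a minimal sketch of that pattern using plain pandas.read_csv rather than the all_parsers test fixture; the sample data here is made up for illustration:

from io import StringIO

import pandas as pd

data = "a,b,c,d\n1,2,3,4\n5,6,7,8\n9,10,11,12\n"

# read_csv with chunksize returns a TextFileReader that can be used as a
# context manager; get_chunk() pulls the next chunk, iteration yields the rest.
with pd.read_csv(StringIO(data), chunksize=2) as reader:
    first = reader.get_chunk()    # first 2 rows
    rest = pd.concat(reader)      # remaining rows as one DataFrame

print(first.shape, rest.shape)    # (2, 4) (1, 4)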
