Skip to content

Commit eac5129

Browse files
authored
BUG: read_csv and read_fwf not skipping all defined rows when nrows is given (#44434)
1 parent fa28c61 commit eac5129

File tree

4 files changed

+58
-16
lines changed

4 files changed

+58
-16
lines changed

doc/source/whatsnew/v1.4.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,7 @@ I/O
635635
- Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`)
636636
- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`)
637637
- Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`)
638+
- Bug in :func:`read_csv` and :func:`read_fwf` ignoring all ``skiprows`` except the first when ``nrows`` is specified for ``engine='python'`` (:issue:`44021`)
638639
- Bug in :func:`read_json` not handling non-numpy dtypes correctly (especially ``category``) (:issue:`21892`, :issue:`33205`)
639640
- Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`)
640641
- Bug in :func:`json_normalize` where reading data with missing multi-level metadata would not respect ``errors="ignore"`` (:issue:`44312`)

pandas/io/parsers/python_parser.py

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919
import numpy as np
2020

2121
import pandas._libs.lib as lib
22-
from pandas._typing import FilePathOrBuffer
22+
from pandas._typing import (
23+
FilePathOrBuffer,
24+
Scalar,
25+
)
2326
from pandas.errors import (
2427
EmptyDataError,
2528
ParserError,
@@ -1020,26 +1023,29 @@ def _get_lines(self, rows=None):
10201023
new_rows = self.data[self.pos : self.pos + rows]
10211024
new_pos = self.pos + rows
10221025

1023-
# Check for stop rows. n.b.: self.skiprows is a set.
1024-
if self.skiprows:
1025-
new_rows = [
1026-
row
1027-
for i, row in enumerate(new_rows)
1028-
if not self.skipfunc(i + self.pos)
1029-
]
1030-
1026+
new_rows = self._remove_skipped_rows(new_rows)
10311027
lines.extend(new_rows)
10321028
self.pos = new_pos
10331029

10341030
else:
10351031
new_rows = []
10361032
try:
10371033
if rows is not None:
1038-
for _ in range(rows):
1034+
1035+
rows_to_skip = 0
1036+
if self.skiprows is not None and self.pos is not None:
1037+
# Read one extra row for every entry of skiprows that is not below pos
1038+
rows_to_skip = len(
1039+
set(self.skiprows) - set(range(self.pos))
1040+
)
1041+
1042+
for _ in range(rows + rows_to_skip):
10391043
# assert for mypy, data is Iterator[str] or None, would
10401044
# error in next
10411045
assert self.data is not None
10421046
new_rows.append(next(self.data))
1047+
1048+
new_rows = self._remove_skipped_rows(new_rows)
10431049
lines.extend(new_rows)
10441050
else:
10451051
rows = 0
@@ -1052,12 +1058,7 @@ def _get_lines(self, rows=None):
10521058
new_rows.append(new_row)
10531059

10541060
except StopIteration:
1055-
if self.skiprows:
1056-
new_rows = [
1057-
row
1058-
for i, row in enumerate(new_rows)
1059-
if not self.skipfunc(i + self.pos)
1060-
]
1061+
new_rows = self._remove_skipped_rows(new_rows)
10611062
lines.extend(new_rows)
10621063
if len(lines) == 0:
10631064
raise
@@ -1076,6 +1077,13 @@ def _get_lines(self, rows=None):
10761077
lines = self._check_thousands(lines)
10771078
return self._check_decimal(lines)
10781079

1080+
def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar]]:
    """
    Drop the rows of ``new_rows`` that ``self.skipfunc`` marks as skipped.

    Parameters
    ----------
    new_rows : list of list of Scalar
        Rows that were just read. The element at index ``i`` is checked
        against position ``i + self.pos``.

    Returns
    -------
    list of list of Scalar
        ``new_rows`` itself when ``self.skiprows`` is falsy; otherwise a new
        list holding only the rows for which ``self.skipfunc`` returned a
        falsy value.
    """
    if not self.skiprows:
        # Nothing is configured to be skipped, so hand the input back untouched.
        return new_rows
    return [
        row for i, row in enumerate(new_rows) if not self.skipfunc(i + self.pos)
    ]
1086+
10791087

10801088
class FixedWidthReader(abc.Iterator):
10811089
"""

pandas/tests/io/parser/test_read_fwf.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -862,3 +862,18 @@ def test_colspecs_with_comment():
862862
)
863863
expected = DataFrame([[1, "K"]], columns=[0, 1])
864864
tm.assert_frame_equal(result, expected)
865+
866+
867+
def test_skip_rows_and_n_rows():
    """
    Check that ``read_fwf`` honours every ``skiprows`` entry when ``nrows`` is set.

    Lines 2 and 4 of the input (0-based, header line included) are skipped and
    four data rows are still requested, so rows 1, 3, 5 and 6 must come back.
    """
    # GH#44021
    data = """a\tb
1\t a
2\t b
3\t c
4\t d
5\t e
6\t f
"""
    result = read_fwf(StringIO(data), nrows=4, skiprows=[2, 4])
    expected = DataFrame({"a": [1, 3, 5, 6], "b": ["a", "c", "e", "f"]})
    tm.assert_frame_equal(result, expected)

pandas/tests/io/parser/test_skiprows.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,3 +256,21 @@ def test_skip_rows_bad_callable(all_parsers):
256256

257257
with pytest.raises(ZeroDivisionError, match=msg):
258258
parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)
259+
260+
261+
def test_skip_rows_and_n_rows(all_parsers):
    """
    Check that ``read_csv`` honours every ``skiprows`` entry when ``nrows`` is set.

    Lines 2, 4 and 6 of the input (0-based, header line included) are skipped
    and five data rows are requested, so rows 1, 3, 5, 7 and 8 must come back.
    """
    # GH#44021
    data = """a,b
1,a
2,b
3,c
4,d
5,e
6,f
7,g
8,h
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
    expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)