From 94746b1087d919542526af303e39401ce9d790c4 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 13 Nov 2021 22:33:11 +0100 Subject: [PATCH 1/2] BUG: read_csv and read_fwf not skipping all defined rows when nrows is given --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/parsers/python_parser.py | 40 +++++++++++++++---------- pandas/tests/io/parser/test_read_fwf.py | 15 ++++++++++ pandas/tests/io/parser/test_skiprows.py | 18 +++++++++++ 4 files changed, 58 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 59b164c156d79..e6e1d5bd18b4c 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -629,6 +629,7 @@ I/O - Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`) - Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`) - Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`) +- Bug in :func:`read_csv` and :func:`read_fwf` ignoring all ``skiprows`` except the first when ``nrows`` is specified for ``engine='python'`` (:issue:`44021`) - Bug in :func:`read_json` not handling non-numpy dtypes correctly (especially ``category``) (:issue:`21892`, :issue:`33205`) - Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`) - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 4d596aa2f3fa6..008f0fc861df6 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -19,7 +19,10 @@ import numpy as np import pandas._libs.lib as lib -from pandas._typing import FilePathOrBuffer +from pandas._typing 
import ( + FilePathOrBuffer, + Scalar, +) from pandas.errors import ( EmptyDataError, ParserError, @@ -1020,14 +1023,7 @@ def _get_lines(self, rows=None): new_rows = self.data[self.pos : self.pos + rows] new_pos = self.pos + rows - # Check for stop rows. n.b.: self.skiprows is a set. - if self.skiprows: - new_rows = [ - row - for i, row in enumerate(new_rows) - if not self.skipfunc(i + self.pos) - ] - + new_rows = self._remove_skipped_rows(new_rows) lines.extend(new_rows) self.pos = new_pos @@ -1035,11 +1031,21 @@ def _get_lines(self, rows=None): new_rows = [] try: if rows is not None: - for _ in range(rows): + + rows_to_skip = 0 + if self.skiprows is not None and self.pos is not None: + # Only read additional rows if pos is in skiprows + rows_to_skip = len( + set(self.skiprows) - set(range(self.pos)) + ) + + for _ in range(rows + rows_to_skip): # assert for mypy, data is Iterator[str] or None, would # error in next assert self.data is not None new_rows.append(next(self.data)) + + new_rows = self._remove_skipped_rows(new_rows) lines.extend(new_rows) else: rows = 0 @@ -1052,12 +1058,7 @@ def _get_lines(self, rows=None): new_rows.append(new_row) except StopIteration: - if self.skiprows: - new_rows = [ - row - for i, row in enumerate(new_rows) - if not self.skipfunc(i + self.pos) - ] + new_rows = self._remove_skipped_rows(new_rows) lines.extend(new_rows) if len(lines) == 0: raise @@ -1076,6 +1077,13 @@ def _get_lines(self, rows=None): lines = self._check_thousands(lines) return self._check_decimal(lines) + def _remove_skipped_rows(self, new_rows: list[Scalar]) -> list[Scalar]: + if self.skiprows: + return [ + row for i, row in enumerate(new_rows) if not self.skipfunc(i + self.pos) + ] + return new_rows + class FixedWidthReader(abc.Iterator): """ diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 8d1fa97f9f8bb..d4e33543d8a04 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ 
b/pandas/tests/io/parser/test_read_fwf.py @@ -862,3 +862,18 @@ def test_colspecs_with_comment(): ) expected = DataFrame([[1, "K"]], columns=[0, 1]) tm.assert_frame_equal(result, expected) + + +def test_skip_rows_and_n_rows(): + # GH#44021 + data = """a\tb +1\t a +2\t b +3\t c +4\t d +5\t e +6\t f + """ + result = read_fwf(StringIO(data), nrows=4, skiprows=[2, 4]) + expected = DataFrame({"a": [1, 3, 5, 6], "b": ["a", "c", "e", "f"]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 9df6bf42c55d2..627bda44016e9 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -256,3 +256,21 @@ def test_skip_rows_bad_callable(all_parsers): with pytest.raises(ZeroDivisionError, match=msg): parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) + + +def test_skip_rows_and_n_rows(all_parsers): + # GH#44021 + data = """a,b +1,a +2,b +3,c +4,d +5,e +6,f +7,g +8,h +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6]) + expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]}) + tm.assert_frame_equal(result, expected) From 92c935783b42f814c58fdc50a714f6825521aa47 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 13 Nov 2021 22:37:12 +0100 Subject: [PATCH 2/2] Adjust type hint --- pandas/io/parsers/python_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 008f0fc861df6..36387f0835f4a 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1077,7 +1077,7 @@ def _get_lines(self, rows=None): lines = self._check_thousands(lines) return self._check_decimal(lines) - def _remove_skipped_rows(self, new_rows: list[Scalar]) -> list[Scalar]: + def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar]]: if self.skiprows: return [ row 
for i, row in enumerate(new_rows) if not self.skipfunc(i + self.pos)