BUG: read_csv and read_fwf not skipping all defined rows when nrows is given (#44434)

phofl · web-flow · commit eac5129ae0ae · 2021-11-15T19:55:58.000-05:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -635,6 +635,7 @@ I/O
 - Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`)
 - Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`)
 - Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`)
+- Bug in :func:`read_csv` and :func:`read_fwf` ignoring all ``skiprows`` except first when ``nrows`` is specified for ``engine='python'`` (:issue:`44021`)
 - Bug in :func:`read_json` not handling non-numpy dtypes correctly (especially ``category``) (:issue:`21892`, :issue:`33205`)
 - Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`)
 - Bug in :func:`json_normalize` where reading data with missing multi-level metadata would not respect errors="ignore" (:issue:`44312`)
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -19,7 +19,10 @@
 import numpy as np
 
 import pandas._libs.lib as lib
-from pandas._typing import FilePathOrBuffer
+from pandas._typing import (
+    FilePathOrBuffer,
+    Scalar,
+)
 from pandas.errors import (
     EmptyDataError,
     ParserError,
@@ -1020,26 +1023,29 @@ def _get_lines(self, rows=None):
                     new_rows = self.data[self.pos : self.pos + rows]
                     new_pos = self.pos + rows
 
-                # Check for stop rows. n.b.: self.skiprows is a set.
-                if self.skiprows:
-                    new_rows = [
-                        row
-                        for i, row in enumerate(new_rows)
-                        if not self.skipfunc(i + self.pos)
-                    ]
-
+                new_rows = self._remove_skipped_rows(new_rows)
                 lines.extend(new_rows)
                 self.pos = new_pos
 
             else:
                 new_rows = []
                 try:
                     if rows is not None:
-                        for _ in range(rows):
+
+                        rows_to_skip = 0
+                        if self.skiprows is not None and self.pos is not None:
+                            # Only read additional rows if pos is in skiprows
+                            rows_to_skip = len(
+                                set(self.skiprows) - set(range(self.pos))
+                            )
+
+                        for _ in range(rows + rows_to_skip):
                             # assert for mypy, data is Iterator[str] or None, would
                             # error in next
                             assert self.data is not None
                             new_rows.append(next(self.data))
+
+                        new_rows = self._remove_skipped_rows(new_rows)
                         lines.extend(new_rows)
                     else:
                         rows = 0
@@ -1052,12 +1058,7 @@ def _get_lines(self, rows=None):
                                 new_rows.append(new_row)
 
                 except StopIteration:
-                    if self.skiprows:
-                        new_rows = [
-                            row
-                            for i, row in enumerate(new_rows)
-                            if not self.skipfunc(i + self.pos)
-                        ]
+                    new_rows = self._remove_skipped_rows(new_rows)
                     lines.extend(new_rows)
                     if len(lines) == 0:
                         raise
@@ -1076,6 +1077,13 @@ def _get_lines(self, rows=None):
         lines = self._check_thousands(lines)
         return self._check_decimal(lines)
 
+    def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar]]:
+        if self.skiprows:
+            return [
+                row for i, row in enumerate(new_rows) if not self.skipfunc(i + self.pos)
+            ]
+        return new_rows
+
 
 class FixedWidthReader(abc.Iterator):
     """
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
@@ -862,3 +862,18 @@ def test_colspecs_with_comment():
     )
     expected = DataFrame([[1, "K"]], columns=[0, 1])
     tm.assert_frame_equal(result, expected)
+
+
+def test_skip_rows_and_n_rows():
+    # GH#44021
+    data = """a\tb
+1\t a
+2\t b
+3\t c
+4\t d
+5\t e
+6\t f
+    """
+    result = read_fwf(StringIO(data), nrows=4, skiprows=[2, 4])
+    expected = DataFrame({"a": [1, 3, 5, 6], "b": ["a", "c", "e", "f"]})
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py
@@ -256,3 +256,21 @@ def test_skip_rows_bad_callable(all_parsers):
 
     with pytest.raises(ZeroDivisionError, match=msg):
         parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)
+
+
+def test_skip_rows_and_n_rows(all_parsers):
+    # GH#44021
+    data = """a,b
+1,a
+2,b
+3,c
+4,d
+5,e
+6,f
+7,g
+8,h
+"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
+    expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
+    tm.assert_frame_equal(result, expected)