From 77ec32ef58c307f00f5f2d0ae67c82a0bbc1233b Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Jun 2022 10:29:09 +0200 Subject: [PATCH 1/3] BUG: read_csv may interpret second row as index names even if header is integer --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/io/parsers/python_parser.py | 8 +++++++- pandas/io/parsers/readers.py | 6 +++++- pandas/tests/io/parser/test_python_parser_only.py | 13 ++++++++++++- 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 76f6e864a174f..b1ea7b1774d79 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -861,6 +861,7 @@ I/O - Bug in :func:`read_csv` not recognizing line break for ``on_bad_lines="warn"`` for ``engine="c"`` (:issue:`41710`) - Bug in :meth:`DataFrame.to_csv` not respecting ``float_format`` for ``Float64`` dtype (:issue:`45991`) - Bug in :func:`read_csv` not respecting a specified converter to index columns in all cases (:issue:`40589`) +- Bug in :func:`read_csv` interpreting second row as :class:`Index` names even when ``header`` is given as an integer (:issue:`46569`) - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`) - Bug in :func:`read_html` where elements surrounding ``<br>`` were joined without a space between them (:issue:`29528`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 37b2ce4c4148b..ca47a9ec7a1b3 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -101,6 +101,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: self.comment = kwds["comment"] + self._passed_header = kwds.get("_header_passed") + # Set self.data to something that can read lines. if isinstance(f, list): # read_excel: f is a list @@ -933,7 +935,11 @@ def _get_index_name( implicit_first_cols = len(line) - self.num_original_columns # Case 0 - if next_line is not None and self.header is not None: + if ( + next_line is not None + and self.header is not None + and not isinstance(self._passed_header, int) + ): if len(next_line) == len(line) + self.num_original_columns: # column and index names on diff rows self.index_col = list(range(len(line))) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 867cdf0ee7636..6ea5eddf079ff 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1396,7 +1396,8 @@ def __init__( ) kwds = _merge_with_dialect_properties(dialect, kwds) - if kwds.get("header", "infer") == "infer": + header = kwds.get("header", "infer") + if header == "infer": kwds["header"] = 0 if kwds.get("names") is None else None self.orig_options = kwds @@ -1418,6 +1419,9 @@ def __init__( if "has_index_names" in kwds: self.options["has_index_names"] = kwds["has_index_names"] + if self.engine == "python": + self.options["_header_passed"] = header + self.handles: IOHandles | None = None self._engine = self._make_engine(f, self.engine) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index abe6c831dd4e4..8e36d609df49e 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -466,6 +466,17 @@ 
def test_index_col_false_and_header_none(python_parser_only): 0.5,0.03 0.1,0.2,0.3,2 """ - result = parser.read_csv(StringIO(data), sep=",", header=None, index_col=False) + with tm.assert_produces_warning(ParserWarning, match="Length of header"): + result = parser.read_csv(StringIO(data), sep=",", header=None, index_col=False) expected = DataFrame({0: [0.5, 0.1], 1: [0.03, 0.2]}) tm.assert_frame_equal(result, expected) + + +def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parser_only): + # GH#46569 + parser = python_parser_only + data = StringIO("a\na,b\nc,d,e\nf,g,h") + with tm.assert_produces_warning(ParserWarning, match="Length of header"): + result = parser.read_csv(data, engine="python", index_col=False, header=0) + expected = DataFrame({"a": ["a", "c", "f"]}) + tm.assert_frame_equal(result, expected) From a724810cdc1dd22ad16475ff391238290812dbfb Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Jun 2022 10:35:57 +0200 Subject: [PATCH 2/3] BUG: read_csv may interpret second row as index names even if index_col is False --- pandas/io/parsers/python_parser.py | 4 +--- pandas/io/parsers/readers.py | 6 +----- pandas/tests/io/parser/test_python_parser_only.py | 2 +- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index ca47a9ec7a1b3..2bf66923ddd60 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -101,8 +101,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: self.comment = kwds["comment"] - self._passed_header = kwds.get("_header_passed") - # Set self.data to something that can read lines. 
if isinstance(f, list): # read_excel: f is a list @@ -938,7 +936,7 @@ def _get_index_name( if ( next_line is not None and self.header is not None - and not isinstance(self._passed_header, int) + and index_col is not False ): if len(next_line) == len(line) + self.num_original_columns: # column and index names on diff rows diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6ea5eddf079ff..867cdf0ee7636 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1396,8 +1396,7 @@ def __init__( ) kwds = _merge_with_dialect_properties(dialect, kwds) - header = kwds.get("header", "infer") - if header == "infer": + if kwds.get("header", "infer") == "infer": kwds["header"] = 0 if kwds.get("names") is None else None self.orig_options = kwds @@ -1419,9 +1418,6 @@ def __init__( if "has_index_names" in kwds: self.options["has_index_names"] = kwds["has_index_names"] - if self.engine == "python": - self.options["_header_passed"] = header - self.handles: IOHandles | None = None self._engine = self._make_engine(f, self.engine) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 8e36d609df49e..0717078a83a46 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -477,6 +477,6 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse parser = python_parser_only data = StringIO("a\na,b\nc,d,e\nf,g,h") with tm.assert_produces_warning(ParserWarning, match="Length of header"): - result = parser.read_csv(data, engine="python", index_col=False, header=0) + result = parser.read_csv(data, engine="python", index_col=False) expected = DataFrame({"a": ["a", "c", "f"]}) tm.assert_frame_equal(result, expected) From bf175acf7e61922a785742c2884df334fd5f1741 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Jun 2022 10:36:36 +0200 Subject: [PATCH 3/3] BUG: read_csv may interpret second row as 
index names even if index_col is False --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b1ea7b1774d79..2346a86ad21f8 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -861,7 +861,7 @@ I/O - Bug in :func:`read_csv` not recognizing line break for ``on_bad_lines="warn"`` for ``engine="c"`` (:issue:`41710`) - Bug in :meth:`DataFrame.to_csv` not respecting ``float_format`` for ``Float64`` dtype (:issue:`45991`) - Bug in :func:`read_csv` not respecting a specified converter to index columns in all cases (:issue:`40589`) -- Bug in :func:`read_csv` interpreting second row as :class:`Index` names even when ``header`` is given as an integer (:issue:`46569`) +- Bug in :func:`read_csv` interpreting second row as :class:`Index` names even when ``index_col=False`` (:issue:`46569`) - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`) - Bug in :func:`read_html` where elements surrounding ``<br>`` were joined without a space between them (:issue:`29528`)