From 59ebf0a186500b94b6e3b2a6225b92a914ffd6af Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 28 Nov 2021 22:35:03 +0100 Subject: [PATCH 1/4] Deprecate raising in read_csv when header row contains only empty cells --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/parsers/base_parser.py | 7 +++++++ pandas/tests/io/parser/test_header.py | 7 +++++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 50156d4565bbd..43fdc68e384f8 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -470,6 +470,7 @@ Other Deprecations - Deprecated :meth:`PeriodIndex.astype` to ``datetime64[ns]`` or ``DatetimeTZDtype``, use ``obj.to_timestamp(how).tz_localize(dtype.tz)`` instead (:issue:`44398`) - Deprecated passing non boolean argument to sort in :func:`concat` (:issue:`41518`) - Deprecated passing ``skipna=None`` for :meth:`DataFrame.mad` and :meth:`Series.mad`, pass ``skipna=True`` instead (:issue:`44580`) +- Deprecated raising an error when header row contains only empty cells in :func:`read_csv` (:issue:`13054`) - Deprecated :meth:`DateOffset.apply`, use ``offset + other`` instead (:issue:`44522`) - A deprecation warning is now shown for :meth:`DataFrame.to_latex` indicating the arguments signature may change and emulate more the arguments to :meth:`.Styler.to_latex` in future versions (:issue:`44411`) - diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 4f5ba3460a3c8..226cfe0c455d1 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -381,6 +381,13 @@ def extract(r): for n in range(len(columns[0])): if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): header = ",".join([str(x) for x in self.header]) + warnings.warn( + f"The passed header=[{header}] has at least one line without data. 
" + "This will return a MultiIndex in the future where at least one " + "level consists of only Unnamed: entries.", + FutureWarning, + stacklevel=find_stack_level(), + ) raise ParserError( f"Passed header=[{header}] are too many rows " "for this multi_index of columns" diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index dc3792989357a..637bee84ad599 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -557,8 +557,11 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): r"Passed header=\[0,1\] are too " r"many rows for this multi_index of columns" ) - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), header=header, index_col=index_col) + with tm.assert_produces_warning( + FutureWarning, match="consists of only Unnamed", check_stacklevel=False + ): + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), header=header, index_col=index_col) else: result = parser.read_csv(StringIO(data), header=header, index_col=index_col) exp_columns = [] From 4ce71f00f27d5692a64dd31080646bcfc5d127de Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 15 Dec 2021 11:47:32 +0100 Subject: [PATCH 2/4] Stop raising immediately --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/io/parsers/base_parser.py | 18 ---------------- pandas/tests/io/parser/test_header.py | 30 ++++++++++----------------- 3 files changed, 12 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 43fdc68e384f8..9d42fef70c9c7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -402,6 +402,7 @@ Other API changes - :meth:`Index.get_indexer_for` no longer accepts keyword arguments (other than 'target'); in the past these would be silently ignored if the index was not unique (:issue:`42310`) - Change in the position of the ``min_rows`` argument in :meth:`DataFrame.to_string` due to change 
in the docstring (:issue:`44304`) - Reduction operations for :class:`DataFrame` or :class:`Series` now raising a ``ValueError`` when ``None`` is passed for ``skipna`` (:issue:`44178`) +- :func:`read_csv` no longer raising an error when one of the header rows consists only of ``Unnamed: `` columns (:issue:`13054`) - .. --------------------------------------------------------------------------- @@ -470,7 +471,6 @@ Other Deprecations - Deprecated :meth:`PeriodIndex.astype` to ``datetime64[ns]`` or ``DatetimeTZDtype``, use ``obj.to_timestamp(how).tz_localize(dtype.tz)`` instead (:issue:`44398`) - Deprecated passing non boolean argument to sort in :func:`concat` (:issue:`41518`) - Deprecated passing ``skipna=None`` for :meth:`DataFrame.mad` and :meth:`Series.mad`, pass ``skipna=True`` instead (:issue:`44580`) -- Deprecated raising an error when header row contains only empty cells in :func:`read_csv` (:issue:`13054`) - Deprecated :meth:`DateOffset.apply`, use ``offset + other`` instead (:issue:`44522`) - A deprecation warning is now shown for :meth:`DataFrame.to_latex` indicating the arguments signature may change and emulate more the arguments to :meth:`.Styler.to_latex` in future versions (:issue:`44411`) - diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 226cfe0c455d1..4cfc8ebee4d1a 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -39,7 +39,6 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( ensure_object, - ensure_str, is_bool_dtype, is_categorical_dtype, is_dict_like, @@ -376,23 +375,6 @@ def extract(r): columns = list(zip(*(extract(r) for r in header))) names = ic + columns - # If we find unnamed columns all in a single - # level, then our header was too long. 
- for n in range(len(columns[0])): - if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): - header = ",".join([str(x) for x in self.header]) - warnings.warn( - f"The passed header=[{header}] has at least one line without data. " - "This will return a MultiIndex in the future where at least one " - "level consists of only Unnamed: entries.", - FutureWarning, - stacklevel=find_stack_level(), - ) - raise ParserError( - f"Passed header=[{header}] are too many rows " - "for this multi_index of columns" - ) - # Clean the column names (if we have an index_col). if len(ic): col_names = [ diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 637bee84ad599..dd3bd0ee10ffa 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -552,29 +552,21 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): else: data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n" + result = parser.read_csv(StringIO(data), header=header, index_col=index_col) + exp_columns = [] + if columns is None: - msg = ( - r"Passed header=\[0,1\] are too " - r"many rows for this multi_index of columns" - ) - with tm.assert_produces_warning( - FutureWarning, match="consists of only Unnamed", check_stacklevel=False - ): - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), header=header, index_col=index_col) - else: - result = parser.read_csv(StringIO(data), header=header, index_col=index_col) - exp_columns = [] + columns = ["", "", ""] - for i, col in enumerate(columns): - if not col: # Unnamed. - col = f"Unnamed: {i if index_col is None else i + 1}_level_0" + for i, col in enumerate(columns): + if not col: # Unnamed. 
+ col = f"Unnamed: {i if index_col is None else i + 1}_level_0" - exp_columns.append(col) + exp_columns.append(col) - columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) - expected = DataFrame([[2, 3], [4, 5]], columns=columns) - tm.assert_frame_equal(result, expected) + columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) + expected = DataFrame([[2, 3], [4, 5]], columns=columns) + tm.assert_frame_equal(result, expected) @skip_pyarrow From 3fa586b40ebd04f7761ad404b959df1908eada5e Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 15 Dec 2021 12:31:17 +0100 Subject: [PATCH 3/4] Adjust whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/tests/io/test_html.py | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 6da9829531b93..e0a7d28408d95 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -454,7 +454,7 @@ Other API changes - :meth:`Index.get_indexer_for` no longer accepts keyword arguments (other than 'target'); in the past these would be silently ignored if the index was not unique (:issue:`42310`) - Change in the position of the ``min_rows`` argument in :meth:`DataFrame.to_string` due to change in the docstring (:issue:`44304`) - Reduction operations for :class:`DataFrame` or :class:`Series` now raising a ``ValueError`` when ``None`` is passed for ``skipna`` (:issue:`44178`) -- :func:`read_csv` no longer raising an error when one of the header rows consists only of ``Unnamed: `` columns (:issue:`13054`) +- :func:`read_csv` and :func:`read_html` no longer raising an error when one of the header rows consists only of ``Unnamed: `` columns (:issue:`13054`) - Changed the ``name`` attribute of several holidays in ``USFederalHolidayCalendar`` to match `official federal holiday names `_ diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index d4b78d8371ede..3aac7e95e6591 100644 --- 
a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -14,7 +14,6 @@ import pytest from pandas.compat import is_platform_windows -from pandas.errors import ParserError import pandas.util._test_decorators as td from pandas import ( @@ -918,13 +917,8 @@ def test_wikipedia_states_multiindex(self, datapath): assert np.allclose(result.loc["Alaska", ("Total area[2]", "sq mi")], 665384.04) def test_parser_error_on_empty_header_row(self): - msg = ( - r"Passed header=\[0,1\] are too many " - r"rows for this multi_index of columns" - ) - with pytest.raises(ParserError, match=msg): - self.read_html( - """ + result = self.read_html( + """ @@ -935,8 +929,15 @@ def test_parser_error_on_empty_header_row(self):
""", - header=[0, 1], - ) + header=[0, 1], + ) + expected = DataFrame( + [["a", "b"]], + columns=MultiIndex.from_tuples( + [("Unnamed: 0_level_0", "A"), ("Unnamed: 1_level_0", "B")] + ), + ) + tm.assert_frame_equal(result[0], expected) def test_decimal_rows(self): # GH 12907 From 4d80ca35bad48c77998fbed80e7752664bcb378e Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 18 Dec 2021 21:42:52 +0100 Subject: [PATCH 4/4] Fix whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 8ae5665c4283f..e189774161736 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -454,7 +454,7 @@ Other API changes - :meth:`Index.get_indexer_for` no longer accepts keyword arguments (other than 'target'); in the past these would be silently ignored if the index was not unique (:issue:`42310`) - Change in the position of the ``min_rows`` argument in :meth:`DataFrame.to_string` due to change in the docstring (:issue:`44304`) - Reduction operations for :class:`DataFrame` or :class:`Series` now raising a ``ValueError`` when ``None`` is passed for ``skipna`` (:issue:`44178`) -- :func:`read_csv` and :func:`read_html` no longer raising an error when one of the header rows consists only of ``Unnamed: `` columns (:issue:`13054`) +- :func:`read_csv` and :func:`read_html` no longer raise an error when one of the header rows consists only of ``Unnamed:`` columns (:issue:`13054`) - Changed the ``name`` attribute of several holidays in ``USFederalHolidayCalendar`` to match `official federal holiday names `_