diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f0af60f80edd5..4b36f481acf50 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -307,6 +307,7 @@ I/O - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`) - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) +- Bug in :func:`read_fwf` where a difference in the lengths of ``colspecs`` and ``names`` did not raise a ``ValueError`` (:issue:`40830`) - Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`) - diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c639a4a9d494e..5a4a82bd341f1 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -756,6 +756,24 @@ def read_fwf( colspecs.append((col, col + w)) col += w + # GH#40830 + # Ensure length of `colspecs` matches length of `names` + names = kwds.get("names") + if names is not None: + if len(names) != len(colspecs): + # need to check len(index_col) as it might contain + # unnamed indices, in which case their names are not required + len_index = 0 + if kwds.get("index_col") is not None: + index_col: Any = kwds.get("index_col") + if index_col is not False: + if not is_list_like(index_col): + len_index = 1 + else: + len_index = len(index_col) + if len(names) + len_index != len(colspecs): + raise ValueError("Length of colspecs must match length of names") + kwds["colspecs"] = colspecs kwds["infer_nrows"] = infer_nrows kwds["engine"] = "python-fwf" diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 9739a2a75886a..6b136618de721 
# Patch context: pandas/tests/io/parser/test_read_fwf.py (file mode 100644),
# hunk "@@ -710,3 +710,146 @@".  The hunk opens on the closing lines of
# ``test_encoding_mmap``, which the patch leaves untouched:
#     data.seek(0)
#     df_reference = DataFrame([[1, "A", "Ä", 2]])
#     tm.assert_frame_equal(df, df_reference)
# Everything below is the code the patch appends to the test module.


# GH#40830
# Two-row sample laid out as four 6-character-wide fields.  The second row is
# shifted by four blanks so that "bab" straddles the first field boundary
# ("ba" | "b   ba") and the fourth field is empty.
_GH40830_DATA = (
    "col1  col2  col3  col4\n"
    "    bab   ba    2"
)

# The same four-field layout, given once as explicit ``colspecs`` and once as
# ``widths``; every test below runs against both spellings.
_GH40830_LAYOUTS = [
    pytest.param([(0, 6), (6, 12), (12, 18), (18, None)], None, id="colspecs"),
    pytest.param(None, [6] * 4, id="widths"),
]


@pytest.mark.parametrize("colspecs, widths", _GH40830_LAYOUTS)
@pytest.mark.parametrize("index_col", [None, True, False])
@pytest.mark.parametrize("names", [list("abcde")])
def test_len_colspecs_len_names(colspecs, names, widths, index_col):
    """
    Check that ``read_fwf`` rejects five ``names`` for a four-field layout.

    The three ``index_col`` values (``None``, ``True``, ``False``) are crossed
    with both layout spellings; each combination must raise ``ValueError``.
    """
    # GH#40830
    msg = "Length of colspecs must match length of names"
    with pytest.raises(ValueError, match=msg):
        read_fwf(
            StringIO(_GH40830_DATA),
            colspecs=colspecs,
            names=names,
            widths=widths,
            index_col=index_col,
        )


@pytest.mark.parametrize("colspecs, widths", _GH40830_LAYOUTS)
@pytest.mark.parametrize(
    "names, index_col, expected",
    [
        # one index column + three named columns
        (
            list("abc"),
            0,
            DataFrame(
                index=["col1", "ba"],
                columns=["a", "b", "c"],
                data=[["col2", "col3", "col4"], ["b   ba", "2", np.nan]],
            ),
        ),
        # two index columns + two named columns
        (
            list("ab"),
            [0, 1],
            DataFrame(
                index=[["col1", "ba"], ["col2", "b   ba"]],
                columns=["a", "b"],
                data=[["col3", "col4"], ["2", np.nan]],
            ),
        ),
        # three index columns + one named column
        (
            list("a"),
            [0, 1, 2],
            DataFrame(
                index=[["col1", "ba"], ["col2", "b   ba"], ["col3", "2"]],
                columns=["a"],
                data=[["col4"], [np.nan]],
            ),
        ),
    ],
)
def test_len_colspecs_len_names_with_index_col(
    colspecs, names, widths, index_col, expected
):
    """
    Check that fewer ``names`` than fields are accepted with ``index_col``.

    In every case ``len(names)`` plus the number of index columns equals the
    four fields of the layout, so parsing must succeed and the index columns
    stay unnamed.
    """
    # GH#40830
    result = read_fwf(
        StringIO(_GH40830_DATA),
        colspecs=colspecs,
        names=names,
        widths=widths,
        index_col=index_col,
    )
    tm.assert_frame_equal(result, expected)