From 95265477e5edace9a1ebf4bed3bb6e9db1d87f7b Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 7 Aug 2021 01:09:06 +0530 Subject: [PATCH 1/8] BUG: read_fwf raise if len colspec doesnt match len names --- pandas/io/parsers/readers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c639a4a9d494e..6828d6f3c2de2 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -756,6 +756,10 @@ def read_fwf( colspecs.append((col, col + w)) col += w + if kwds.get("names") is not None: + if len(kwds.get("names")) != len(colspecs): + raise ValueError("Length of colspecs must match length of names") + kwds["colspecs"] = colspecs kwds["infer_nrows"] = infer_nrows kwds["engine"] = "python-fwf" From 4e813a0fedca597a6920a87459b24efc588471d4 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 7 Aug 2021 13:17:52 +0530 Subject: [PATCH 2/8] length of columns considered --- pandas/io/parsers/readers.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6828d6f3c2de2..c0ab9ded1e906 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -756,8 +756,17 @@ def read_fwf( colspecs.append((col, col + w)) col += w + # GH#40830 + # Ensure length of `colspecs` matches length of `names` if kwds.get("names") is not None: - if len(kwds.get("names")) != len(colspecs): + len_index = 0 + if kwds.get("index_col") is not None: + index_col = kwds.get("index_col") + if isinstance(index_col, int) or isinstance(index_col, str): + len_index = 1 + else: + len_index = len(index_col) + if len(kwds.get("names")) + len_index != len(colspecs): raise ValueError("Length of colspecs must match length of names") kwds["colspecs"] = colspecs From b126fb4fba81307dc74147cf00bb9aa152b13ddb Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 7 Aug 2021 15:26:16 +0530 Subject: [PATCH 3/8] added test; whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/io/parsers/readers.py | 27 ++++++++++++++++--------- pandas/tests/io/parser/test_read_fwf.py | 17 ++++++++++++++++ 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index fa9c424351b00..8d3d9fc41fa37 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -258,7 +258,7 @@ I/O - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`) - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) -- +- Bug in :func:`read_fwf` where difference in length of ``colspecs`` and ``names`` did not throw any error (:issue:`40830`) Period ^^^^^^ diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c0ab9ded1e906..203e5dd6a13b2 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -758,16 +758,23 @@ def read_fwf( # GH#40830 # Ensure length of `colspecs` matches length of `names` - if kwds.get("names") is not None: - len_index = 0 - if kwds.get("index_col") is not None: - index_col = kwds.get("index_col") - if isinstance(index_col, int) or isinstance(index_col, str): - len_index = 1 - else: - len_index = len(index_col) - if len(kwds.get("names")) + len_index != len(colspecs): - raise ValueError("Length of colspecs must match length of names") + names = kwds.get("names") + if names is not None: + if len(names) != len(colspecs): + # maybe name of index(s) not present in `names` + len_index = 0 + if kwds.get("index_col") is not None: + index_col = kwds.get("index_col") + if isinstance(index_col, int) or isinstance(index_col, str): + len_index = 1 + elif index_col is False: + len_index = 0 + else: + # error: Argument 1 to "len" has incompatible type + # "Optional[Any]"; expected "Sized" [arg-type] + len_index = len(index_col) # type: ignore[arg-type] + if len(names) + len_index != len(colspecs): + raise ValueError("Length of colspecs must match length of names") kwds["colspecs"] = colspecs kwds["infer_nrows"] = infer_nrows diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 9739a2a75886a..c54590992c22d 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -710,3 +710,20 @@ def test_encoding_mmap(memory_map): data.seek(0) df_reference = DataFrame([[1, "A", "Ä", 2]]) tm.assert_frame_equal(df, df_reference) + + +@pytest.mark.parametrize( + "colspecs, names, widths", + [ + ([(0, 6), (6, 12), (12, 18), (18, None)], list("abcde"), None), + ([6] * 4, list("abcde"), None), + (None, list("abcde"), [6] * 4), + ], +) +def test_len_colspecs_len_names(colspecs, names, widths): + # GH#40830 + data = """col1 col2 col3 col4 + bab ba 2""" + msg = "Length of colspecs must match length of names" + with pytest.raises(ValueError, match=msg): + read_fwf(StringIO(data), colspecs=colspecs, names=names, widths=widths) From e955b14379d4751cdcd06d829f54f771591c90f9 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 8 Aug 2021 14:00:13 +0530 Subject: [PATCH 4/8] suggested edits --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/io/parsers/readers.py | 2 +- pandas/tests/io/parser/test_read_fwf.py | 9 ++++++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 8d3d9fc41fa37..47892ace8fddc 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -258,7 +258,7 @@ I/O - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`) - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) -- Bug in :func:`read_fwf` where difference in length of ``colspecs`` and ``names`` did not throw any error (:issue:`40830`) +- Bug in :func:`read_fwf`, where difference in lengths of ``colspecs`` and ``names`` was not raising ``ValueError`` (:issue:`40830`) Period ^^^^^^ diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 203e5dd6a13b2..ed51655d636e0 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -765,7 +765,7 @@ def read_fwf( len_index = 0 if kwds.get("index_col") is not None: index_col = kwds.get("index_col") - if isinstance(index_col, int) or isinstance(index_col, str): + if isinstance(index_col, (int, str)): len_index = 1 elif index_col is False: len_index = 0 diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index c54590992c22d..87a820470453b 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -716,7 +716,6 @@ def test_encoding_mmap(memory_map): "colspecs, names, widths", [ ([(0, 6), (6, 12), (12, 18), (18, None)], list("abcde"), None), - ([6] * 4, list("abcde"), None), (None, list("abcde"), [6] * 4), ], ) @@ -727,3 +726,11 @@ def test_len_colspecs_len_names(colspecs, names, widths): msg = "Length of colspecs must match length of names" with pytest.raises(ValueError, match=msg): read_fwf(StringIO(data), colspecs=colspecs, names=names, widths=widths) + + +def test_index_col_True(): + data = """col1 col2 col3 col4 + bab ba 2""" + msg = "The value of index_col couldn't be 'True'" + with pytest.raises(ValueError, match=msg): + read_fwf(StringIO(data), index_col=True) From ae0cfaa805e00c3649f98e146ed2e01473e3e2c9 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 8 Aug 2021 23:13:52 +0530 Subject: [PATCH 5/8] included index_col params alts in test --- pandas/io/parsers/readers.py | 4 +- pandas/tests/io/parser/test_read_fwf.py | 65 +++++++++++++++++++------ 2 files changed, 54 insertions(+), 15 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index ed51655d636e0..ce02f9e25e1a5 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -765,7 +765,9 @@ def read_fwf( len_index = 0 if kwds.get("index_col") is not None: index_col = kwds.get("index_col") - if isinstance(index_col, (int, str)): + if index_col is True: + raise ValueError("The value of index_col couldn't be 'True'") + elif isinstance(index_col, (int, str)): len_index = 1 elif index_col is False: len_index = 0 diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 87a820470453b..5cf42378a3bc1 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -713,24 +713,61 @@ def test_encoding_mmap(memory_map): @pytest.mark.parametrize( - "colspecs, names, widths", + "colspecs, names, widths, index_col, msg", [ - ([(0, 6), (6, 12), (12, 18), (18, None)], list("abcde"), None), - (None, list("abcde"), [6] * 4), + ( + [(0, 6), (6, 12), (12, 18), (18, None)], + list("abcde"), + None, + None, + "Length of colspecs must match length of names", + ), + ( + None, + list("abcde"), + [6] * 4, + None, + "Length of colspecs must match length of names", + ), + ( + [(0, 6), (6, 12), (12, 18), (18, None)], + list("abcde"), + None, + True, + "The value of index_col couldn't be 'True'", + ), + ( + None, + list("abcde"), + [6] * 4, + False, + "Length of colspecs must match length of names", + ), + ( + None, + list("abcde"), + [6] * 4, + True, + "The value of index_col couldn't be 'True'", + ), + ( + [(0, 6), (6, 12), (12, 18), (18, None)], + list("abcde"), + None, + False, + "Length of colspecs must match length of names", + ), ], ) -def test_len_colspecs_len_names(colspecs, names, widths): +def test_len_colspecs_len_names(colspecs, names, widths, index_col, msg): # GH#40830 data = """col1 col2 col3 col4 bab ba 2""" - msg = "Length of colspecs must match length of names" - with pytest.raises(ValueError, match=msg): - read_fwf(StringIO(data), colspecs=colspecs, names=names, widths=widths) - - -def test_index_col_True(): - data = """col1 col2 col3 col4 - bab ba 2""" - msg = "The value of index_col couldn't be 'True'" with pytest.raises(ValueError, match=msg): - read_fwf(StringIO(data), index_col=True) + read_fwf( + StringIO(data), + colspecs=colspecs, + names=names, + widths=widths, + index_col=index_col, + ) From 28924f0ca17087916699c90173e1e0c9f645b35c Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Mon, 9 Aug 2021 21:07:41 +0530 Subject: [PATCH 6/8] suggested changes --- pandas/io/parsers/readers.py | 15 +++++---------- pandas/tests/io/parser/test_read_fwf.py | 11 +++-------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index ce02f9e25e1a5..024456dbd6d47 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -765,16 +765,11 @@ def read_fwf( len_index = 0 if kwds.get("index_col") is not None: index_col = kwds.get("index_col") - if index_col is True: - raise ValueError("The value of index_col couldn't be 'True'") - elif isinstance(index_col, (int, str)): - len_index = 1 - elif index_col is False: - len_index = 0 - else: - # error: Argument 1 to "len" has incompatible type - # "Optional[Any]"; expected "Sized" [arg-type] - len_index = len(index_col) # type: ignore[arg-type] + if index_col is not False: + if not is_list_like(index_col): + len_index = 1 + else: + len_index = len(index_col) if len(names) + len_index != len(colspecs): raise ValueError("Length of colspecs must match length of names") diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 5cf42378a3bc1..524438b4395b4 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -713,56 +713,51 @@ def test_encoding_mmap(memory_map): @pytest.mark.parametrize( - "colspecs, names, widths, index_col, msg", + "colspecs, names, widths, index_col", [ ( [(0, 6), (6, 12), (12, 18), (18, None)], list("abcde"), None, None, - "Length of colspecs must match length of names", ), ( None, list("abcde"), [6] * 4, None, - "Length of colspecs must match length of names", ), ( [(0, 6), (6, 12), (12, 18), (18, None)], list("abcde"), None, True, - "The value of index_col couldn't be 'True'", ), ( None, list("abcde"), [6] * 4, False, - "Length of colspecs must match length of names", ), ( None, list("abcde"), [6] * 4, True, - "The value of index_col couldn't be 'True'", ), ( [(0, 6), (6, 12), (12, 18), (18, None)], list("abcde"), None, False, - "Length of colspecs must match length of names", ), ], ) -def test_len_colspecs_len_names(colspecs, names, widths, index_col, msg): +def test_len_colspecs_len_names(colspecs, names, widths, index_col): # GH#40830 data = """col1 col2 col3 col4 bab ba 2""" + msg = "Length of colspecs must match length of names" with pytest.raises(ValueError, match=msg): read_fwf( StringIO(data), From d5c8c38e803095a7cc3d7708ca23d5f3659b23b1 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Mon, 9 Aug 2021 22:07:46 +0530 Subject: [PATCH 7/8] added type --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 024456dbd6d47..9ddea2cc263fb 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -764,7 +764,7 @@ def read_fwf( # maybe name of index(s) not present in `names` len_index = 0 if kwds.get("index_col") is not None: - index_col = kwds.get("index_col") + index_col: Any = kwds.get("index_col") if index_col is not False: if not is_list_like(index_col): len_index = 1 From 32d8092a6495be92db9e6c4e211e899a64ea0f1f Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 14 Aug 2021 23:40:10 +0530 Subject: [PATCH 8/8] changed comment; added test for index_col --- pandas/io/parsers/readers.py | 3 +- pandas/tests/io/parser/test_read_fwf.py | 87 +++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 9ddea2cc263fb..5a4a82bd341f1 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -761,7 +761,8 @@ def read_fwf( names = kwds.get("names") if names is not None: if len(names) != len(colspecs): - # maybe name of index(s) not present in `names` + # need to check len(index_col) as it might contain + # unnamed indices, in which case it's name is not required len_index = 0 if kwds.get("index_col") is not None: index_col: Any = kwds.get("index_col") diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 524438b4395b4..6b136618de721 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -766,3 +766,90 @@ def test_len_colspecs_len_names(colspecs, names, widths, index_col): widths=widths, index_col=index_col, ) + + +@pytest.mark.parametrize( + "colspecs, names, widths, index_col, expected", + [ + ( + [(0, 6), (6, 12), (12, 18), (18, None)], + list("abc"), + None, + 0, + DataFrame( + index=["col1", "ba"], + columns=["a", "b", "c"], + data=[["col2", "col3", "col4"], ["b ba", "2", np.nan]], + ), + ), + ( + [(0, 6), (6, 12), (12, 18), (18, None)], + list("ab"), + None, + [0, 1], + DataFrame( + index=[["col1", "ba"], ["col2", "b ba"]], + columns=["a", "b"], + data=[["col3", "col4"], ["2", np.nan]], + ), + ), + ( + [(0, 6), (6, 12), (12, 18), (18, None)], + list("a"), + None, + [0, 1, 2], + DataFrame( + index=[["col1", "ba"], ["col2", "b ba"], ["col3", "2"]], + columns=["a"], + data=[["col4"], [np.nan]], + ), + ), + ( + None, + list("abc"), + [6] * 4, + 0, + DataFrame( + index=["col1", "ba"], + columns=["a", "b", "c"], + data=[["col2", "col3", "col4"], ["b ba", "2", np.nan]], + ), + ), + ( + None, + list("ab"), + [6] * 4, + [0, 1], + DataFrame( + index=[["col1", "ba"], ["col2", "b ba"]], + columns=["a", "b"], + data=[["col3", "col4"], ["2", np.nan]], + ), + ), + ( + None, + list("a"), + [6] * 4, + [0, 1, 2], + DataFrame( + index=[["col1", "ba"], ["col2", "b ba"], ["col3", "2"]], + columns=["a"], + data=[["col4"], [np.nan]], + ), + ), + ], +) +def test_len_colspecs_len_names_with_index_col( + colspecs, names, widths, index_col, expected +): + # GH#40830 + data = """col1 col2 col3 col4 + bab ba 2""" + result = read_fwf( + StringIO(data), + colspecs=colspecs, + names=names, + widths=widths, + index_col=index_col, + ) + tm.assert_frame_equal(result, expected)