From ff68dbdc59468ec5698441b36c000b6e4be4127b Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 13 Nov 2018 21:32:57 -0800 Subject: [PATCH] BUG: 'Unnamed' != unnamed column in CSV A false criterion was causing errors when specified headers appeared to capture a seemingly unnamed row, just because they had the string "Unnamed" in them. --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/io/parsers.py | 14 +++++----- pandas/tests/io/parser/header.py | 45 ++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 007f5b7feb060..4ca65975605c3 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1359,6 +1359,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) - Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`) +- Bug in :func:`read_csv()` in which unnamed columns were being improperly identified when extracting a multi-index (:issue:`23687`) - :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. 
(:issue:`21616`) - :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) - :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9fd35effe1b07..7bf570fe439b0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1387,22 +1387,20 @@ def extract(r): columns = lzip(*[extract(r) for r in header]) names = ic + columns - def tostr(x): - return str(x) if not isinstance(x, compat.string_types) else x - - # if we find 'Unnamed' all of a single level, then our header was too - # long + # If we find unnamed columns all in a single + # level, then our header was too long. for n in range(len(columns[0])): - if all('Unnamed' in tostr(c[n]) for c in columns): + if all(compat.to_str(c[n]) in self.unnamed_cols for c in columns): raise ParserError( "Passed header=[%s] are too many rows for this " "multi_index of columns" % ','.join(str(x) for x in self.header) ) - # clean the column names (if we have an index_col) + # Clean the column names (if we have an index_col). 
if len(ic): - col_names = [r[0] if len(r[0]) and 'Unnamed' not in r[0] else None + col_names = [r[0] if (len(r[0]) and + r[0] not in self.unnamed_cols) else None for r in header] else: col_names = [None] * len(header) diff --git a/pandas/tests/io/parser/header.py b/pandas/tests/io/parser/header.py index fe7a16e6447b3..2191fdceb6928 100644 --- a/pandas/tests/io/parser/header.py +++ b/pandas/tests/io/parser/header.py @@ -11,6 +11,7 @@ import pytest from pandas.compat import StringIO, lrange, u +from pandas.errors import ParserError from pandas import DataFrame, Index, MultiIndex import pandas.util.testing as tm @@ -360,3 +361,47 @@ def test_mangles_multi_index(self): ('A', 'one.1.1'), ('B', 'two'), ('B', 'two.1')])) tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("index_col", [None, [0]]) + @pytest.mark.parametrize("columns", [None, + (["", "Unnamed"]), + (["Unnamed", ""]), + (["Unnamed", "NotUnnamed"])]) + def test_multi_index_unnamed(self, index_col, columns): + # see gh-23687 + # + # When specifying a multi-index header, make sure that + # we don't error just because one of the rows in our header + # has ALL column names containing the string "Unnamed". The + # correct condition to check is whether the row contains + # ALL columns that did not have names (and instead were given + # placeholder ones). + header = [0, 1] + + if index_col is None: + data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n" + else: + data = (",".join([""] + (columns or ["", ""])) + + "\n,0,1\n0,2,3\n1,4,5\n") + + if columns is None: + msg = (r"Passed header=\[0,1\] are too " + r"many rows for this multi_index of columns") + with pytest.raises(ParserError, match=msg): + self.read_csv(StringIO(data), header=header, + index_col=index_col) + else: + result = self.read_csv(StringIO(data), header=header, + index_col=index_col) + template = "Unnamed: {i}_level_0" + exp_columns = [] + + for i, col in enumerate(columns): + if not col: # Unnamed. 
+ col = template.format(i=i if index_col is None else i + 1) + + exp_columns.append(col) + + columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) + expected = DataFrame([[2, 3], [4, 5]], columns=columns) + tm.assert_frame_equal(result, expected)