Skip to content

BUG: 'Unnamed' != unnamed column in CSV #23687

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 16, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1359,6 +1359,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`)
- Bug in :func:`read_csv()` in which unnamed columns were being improperly identified when extracting a multi-index (:issue:`23687`)
- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)
Expand Down
14 changes: 6 additions & 8 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1387,22 +1387,20 @@ def extract(r):
columns = lzip(*[extract(r) for r in header])
names = ic + columns

def tostr(x):
return str(x) if not isinstance(x, compat.string_types) else x

# if we find 'Unnamed' all of a single level, then our header was too
# long
# If we find unnamed columns all in a single
# level, then our header was too long.
for n in range(len(columns[0])):
if all('Unnamed' in tostr(c[n]) for c in columns):
if all(compat.to_str(c[n]) in self.unnamed_cols for c in columns):
raise ParserError(
"Passed header=[%s] are too many rows for this "
"multi_index of columns"
% ','.join(str(x) for x in self.header)
)

# clean the column names (if we have an index_col)
# Clean the column names (if we have an index_col).
if len(ic):
col_names = [r[0] if len(r[0]) and 'Unnamed' not in r[0] else None
col_names = [r[0] if (len(r[0]) and
r[0] not in self.unnamed_cols) else None
for r in header]
else:
col_names = [None] * len(header)
Expand Down
45 changes: 45 additions & 0 deletions pandas/tests/io/parser/header.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pytest

from pandas.compat import StringIO, lrange, u
from pandas.errors import ParserError

from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm
Expand Down Expand Up @@ -360,3 +361,47 @@ def test_mangles_multi_index(self):
('A', 'one.1.1'), ('B', 'two'),
('B', 'two.1')]))
tm.assert_frame_equal(df, expected)

@pytest.mark.parametrize("index_col", [None, [0]])
@pytest.mark.parametrize("columns", [None,
                                     ["", "Unnamed"],
                                     ["Unnamed", ""],
                                     ["Unnamed", "NotUnnamed"]])
def test_multi_index_unnamed(self, index_col, columns):
    # see gh-23687
    #
    # A two-row header must only be rejected as "too many rows" when a
    # whole header row consists of columns that had no name at all (and
    # were therefore given placeholder names). A column whose real name
    # merely contains the text "Unnamed" must not trigger that error.
    #
    # ``columns=None`` stands for the all-blank first header row, which
    # is the one case where the error is expected.
    header = [0, 1]
    first_row = columns or ["", ""]

    # With an index column, every row gains one leading field, so the
    # position of each data column shifts right by one.
    if index_col is None:
        shift = 0
        data = ",".join(first_row) + "\n0,1\n2,3\n4,5\n"
    else:
        shift = 1
        data = ",".join([""] + first_row) + "\n,0,1\n0,2,3\n1,4,5\n"

    if columns is None:
        msg = (r"Passed header=\[0,1\] are too "
               r"many rows for this multi_index of columns")
        with pytest.raises(ParserError, match=msg):
            self.read_csv(StringIO(data), header=header,
                          index_col=index_col)
        return

    result = self.read_csv(StringIO(data), header=header,
                           index_col=index_col)

    # Blank first-level names are expected to come back as the
    # positional placeholder; everything else is kept verbatim.
    placeholder = "Unnamed: {i}_level_0"
    top_level = [name if name else placeholder.format(i=pos + shift)
                 for pos, name in enumerate(columns)]

    expected_columns = MultiIndex.from_tuples(zip(top_level, ["0", "1"]))
    expected = DataFrame([[2, 3], [4, 5]], columns=expected_columns)
    tm.assert_frame_equal(result, expected)