From b8200e4c35a9d41dabfa791e189ec01041da6988 Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Sat, 5 Nov 2016 22:05:18 -0400 Subject: [PATCH 01/12] BUG: read_csv with empty df read_csv would fail on files if the number of header lines passed in includes all the lines in the files. This commit fixes that bug. --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/io/tests/parser/common.py | 18 ++++++++++++++++++ pandas/parser.pyx | 6 ++++-- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index db5bd22393e64..595fda7086811 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -57,5 +57,6 @@ Bug Fixes - Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`) - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue`14327`) - Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`) +- Bug in ``pd.read_csv`` where reading files fails if the number of headers is equal to the number of lines in the file (:issue:`14515`) - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`) \ No newline at end of file diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 4cb00c48976a4..6c25c9875d8c5 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -606,6 +606,24 @@ def test_multi_index_no_level_names(self): expected = self.read_csv(StringIO(data), index_col=[1, 0]) tm.assert_frame_equal(df, expected, check_names=False) + def test_multi_index_blank_df(self): + # GH 14545 + data = """a,b +""" + df = self.read_csv(StringIO(data), header=[0]) + expected = DataFrame(columns=[('a'),('b')]) + tm.assert_frame_equal(df, expected) + expected_csv = expected.to_csv() + round_trip = self.read_csv(StringIO(expected_csv)) + tm.assert_frame_equal(expected, round_trip) + + data_multiline = """a,b +c,d +""" + df2 = self.read_csv(StringIO(data_multiline), header=[0,1]) + expected2 = DataFrame(columns=[('a', 'c'), ('b', 'd')]) + tm.assert_frame_equal(df2, expected2) + def test_no_unnamed_index(self): data = """ id c0 c1 c2 0 1 0 a b diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 9fb99637731be..af3e19ba8d4ee 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -717,7 +717,9 @@ cdef class TextReader: start = self.parser.line_start[0] # e.g., if header=3 and file only has 2 lines - elif self.parser.lines < hr + 1: + if (self.parser.lines < hr + 1 + and not isinstance(self.orig_header, list)) or ( + self.parser.lines < hr): msg = self.orig_header if isinstance(msg, list): msg = "[%s], len of %d," % ( @@ -940,7 +942,7 @@ cdef class TextReader: raise_parser_error('Error tokenizing data', self.parser) footer = self.skipfooter - if self.parser_start == self.parser.lines: + if self.parser_start >= self.parser.lines: raise StopIteration self._end_clock('Tokenization') From 2f64d578506ab86321aa7ec5c41a5c2b3fd92e90 Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Sat, 5 Nov 2016 22:11:48 -0400 Subject: [PATCH 02/12] pep8 --- pandas/io/tests/parser/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 6c25c9875d8c5..2db14a1685608 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -611,7 +611,7 @@ def test_multi_index_blank_df(self): data = """a,b """ df = self.read_csv(StringIO(data), header=[0]) - expected = DataFrame(columns=[('a'),('b')]) + expected = DataFrame(columns=[('a'), ('b')]) tm.assert_frame_equal(df, expected) expected_csv = expected.to_csv() round_trip = self.read_csv(StringIO(expected_csv)) @@ -620,7 +620,7 @@ def test_multi_index_blank_df(self): data_multiline = """a,b c,d """ - df2 = self.read_csv(StringIO(data_multiline), header=[0,1]) + df2 = self.read_csv(StringIO(data_multiline), header=[0, 1]) expected2 = DataFrame(columns=[('a', 'c'), ('b', 'd')]) tm.assert_frame_equal(df2, expected2) From bfe0423921fa11bc5d22caca8e09cea1ac3543b0 Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Mon, 7 Nov 2016 10:10:18 -0500 Subject: [PATCH 03/12] typo --- pandas/parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index af3e19ba8d4ee..3376b338685f1 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -717,7 +717,7 @@ cdef class TextReader: start = self.parser.line_start[0] # e.g., if header=3 and file only has 2 lines - if (self.parser.lines < hr + 1 + elif (self.parser.lines < hr + 1 and not isinstance(self.orig_header, list)) or ( self.parser.lines < hr): msg = self.orig_header From 72adaf210895d379ba57875103610782a2bedea3 Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Mon, 7 Nov 2016 12:23:02 -0500 Subject: [PATCH 04/12] remove unnecessary test --- pandas/io/tests/parser/common.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 2db14a1685608..0cb4dffe0e139 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -611,11 +611,8 @@ def test_multi_index_blank_df(self): data = """a,b """ df = self.read_csv(StringIO(data), header=[0]) - expected = DataFrame(columns=[('a'), ('b')]) + expected = DataFrame(columns=[('a',), ('b',)]) tm.assert_frame_equal(df, expected) - expected_csv = expected.to_csv() - round_trip = self.read_csv(StringIO(expected_csv)) - tm.assert_frame_equal(expected, round_trip) data_multiline = """a,b c,d From 17e44dd3e49f294f9518ac2173eaa06489faf997 Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Fri, 11 Nov 2016 09:23:38 -0500 Subject: [PATCH 05/12] fix python parser too --- pandas/io/parsers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 092cba093421a..4a501573c8cc4 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2083,6 +2083,12 @@ def _infer_columns(self): # We have an empty file, so check # if columns are provided. That will # serve as the 'line' for parsing + if have_mi_columns: + if clear_buffer: + self._clear_buffer() + columns.append([None] * len(this_columns)) + return columns, num_original_columns + if not self.names: raise EmptyDataError( "No columns to parse from file") From 68eadf3afaa4815c97e5875acc4f0f8202048e0f Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Fri, 11 Nov 2016 10:37:16 -0500 Subject: [PATCH 06/12] Modify test. A test in test_to_csv checked for the presence of exactly the behavior we're fixing here: A file with 5 lines that asks for a header of length 5 should work and return an empty dataframe, not error. --- pandas/tests/frame/test_to_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 4d6a5bb32038d..1eb3454519ce3 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -587,7 +587,7 @@ def _make_frame(names=None): df = _make_frame(True) df.to_csv(path, tupleize_cols=False) - for i in [5, 6, 7]: + for i in [6, 7]: msg = 'len of {i}, but only 5 lines in file'.format(i=i) with assertRaisesRegexp(ParserError, msg): read_csv(path, tupleize_cols=False, From 3d9bbddea2e22d4ee86111a44cf21cbbfce12895 Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Fri, 11 Nov 2016 10:39:54 -0500 Subject: [PATCH 07/12] whatsnew --- doc/source/whatsnew/v0.19.1.txt | 1 - doc/source/whatsnew/v0.20.0.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 595fda7086811..db5bd22393e64 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -57,6 +57,5 @@ Bug Fixes - Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`) - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue`14327`) - Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`) -- Bug in ``pd.read_csv`` where reading files fails if the number of headers is equal to the number of lines in the file (:issue:`14515`) - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`) \ No newline at end of file diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 581106924c77e..e4cd76c6da36a 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -81,3 +81,4 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``pd.read_csv`` where reading files fails if the number of headers is equal to the number of lines in the file (:issue:`14515`) From fc23e5c899e78a6cdaf8a6d420eb8ceffe3f7584 Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Sat, 12 Nov 2016 21:47:18 -0500 Subject: [PATCH 08/12] fix errant this_columns --- pandas/io/parsers.py | 4 ++-- pandas/parser.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4a501573c8cc4..fa1904a8fb955 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2083,10 +2083,10 @@ def _infer_columns(self): # We have an empty file, so check # if columns are provided. That will # serve as the 'line' for parsing - if have_mi_columns: + if have_mi_columns and hr > 0: if clear_buffer: self._clear_buffer() - columns.append([None] * len(this_columns)) + columns.append([None] * len(columns[-1])) return columns, num_original_columns if not self.names: diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 3376b338685f1..0b1c9eba63ba7 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -718,7 +718,7 @@ cdef class TextReader: # e.g., if header=3 and file only has 2 lines elif (self.parser.lines < hr + 1 - and not isinstance(self.orig_header, list)) or ( + and not isinstance(self.orig_header, list)) or ( self.parser.lines < hr): msg = self.orig_header if isinstance(msg, list): From 518982d1890ae8a58d159cca91ab45242d3b1880 Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Sun, 13 Nov 2016 08:28:26 -0500 Subject: [PATCH 09/12] move to 0.19.2 --- doc/source/whatsnew/v0.19.2.txt | 2 +- doc/source/whatsnew/v0.20.0.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index ecbd6e9b3b288..a58e3499ac38f 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -29,7 +29,7 @@ Bug Fixes - Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`) - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`) - +- Bug in ``pd.read_csv`` where reading files fails if the number of headers is equal to the number of lines in the file (:issue:`14515`) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index e4cd76c6da36a..581106924c77e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -81,4 +81,3 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Bug in ``pd.read_csv`` where reading files fails if the number of headers is equal to the number of lines in the file (:issue:`14515`) From fedfff8231d53045ec6900b85f29d2e5863ab70b Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Tue, 15 Nov 2016 10:14:10 -0500 Subject: [PATCH 10/12] fix multiindex column parsing --- pandas/io/parsers.py | 6 ++++-- pandas/io/tests/parser/common.py | 12 ++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index fa1904a8fb955..13b67068985f3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1509,10 +1509,10 @@ def read(self, nrows=None): if self._first_chunk: self._first_chunk = False names = self._maybe_dedup_names(self.orig_names) - index, columns, col_dict = _get_empty_meta( names, self.index_col, self.index_names, dtype=self.kwds.get('dtype')) + columns = self._maybe_make_multi_index_columns(columns, self.col_names) if self.usecols is not None: columns = self._filter_usecols(columns) @@ -1979,8 +1979,10 @@ def read(self, rows=None): if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) - return _get_empty_meta(names, self.index_col, + index, columns, col_dict = _get_empty_meta(names, self.index_col, self.index_names) + columns = self._maybe_make_multi_index_columns(columns, self.col_names) + return index, columns, col_dict # handle new style for names in index count_empty_content_vals = count_empty_vals(content[0]) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 0cb4dffe0e139..941cd9db8c71a 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -611,15 +611,23 @@ def test_multi_index_blank_df(self): data = """a,b """ df = self.read_csv(StringIO(data), header=[0]) - expected = DataFrame(columns=[('a',), ('b',)]) + expected = DataFrame(columns=['a', 'b']) tm.assert_frame_equal(df, expected) + round_trip = self.read_csv(StringIO( + expected.to_csv(index=False)), header=[0]) + tm.assert_frame_equal(round_trip, expected) data_multiline = """a,b c,d """ df2 = self.read_csv(StringIO(data_multiline), header=[0, 1]) - expected2 = DataFrame(columns=[('a', 'c'), ('b', 'd')]) + cols = MultiIndex.from_tuples([('a','c'), ('b', 'd')]) + expected2 = DataFrame(columns=cols) tm.assert_frame_equal(df2, expected2) + round_trip = self.read_csv(StringIO( + expected2.to_csv(index=False)), header=[0, 1]) + tm.assert_frame_equal(round_trip, expected2) + def test_no_unnamed_index(self): data = """ id c0 c1 c2 From e6b1237791fa8392f359a89a6a8a6a4f4da5aed3 Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Tue, 15 Nov 2016 10:53:27 -0500 Subject: [PATCH 11/12] lint --- pandas/io/parsers.py | 10 ++++++---- pandas/io/tests/parser/common.py | 3 +-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 13b67068985f3..3fe5e5e826ebd 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1512,7 +1512,8 @@ def read(self, nrows=None): index, columns, col_dict = _get_empty_meta( names, self.index_col, self.index_names, dtype=self.kwds.get('dtype')) - columns = self._maybe_make_multi_index_columns(columns, self.col_names) + columns = self._maybe_make_multi_index_columns( + columns, self.col_names) if self.usecols is not None: columns = self._filter_usecols(columns) @@ -1979,9 +1980,10 @@ def read(self, rows=None): if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) - index, columns, col_dict = _get_empty_meta(names, self.index_col, - self.index_names) - columns = self._maybe_make_multi_index_columns(columns, self.col_names) + index, columns, col_dict = _get_empty_meta( + names, self.index_col, self.index_names) + columns = self._maybe_make_multi_index_columns( + columns, self.col_names) return index, columns, col_dict # handle new style for names in index diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 941cd9db8c71a..6eb73876c11dd 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -621,14 +621,13 @@ def test_multi_index_blank_df(self): c,d """ df2 = self.read_csv(StringIO(data_multiline), header=[0, 1]) - cols = MultiIndex.from_tuples([('a','c'), ('b', 'd')]) + cols = MultiIndex.from_tuples([('a', 'c'), ('b', 'd')]) expected2 = DataFrame(columns=cols) tm.assert_frame_equal(df2, expected2) round_trip = self.read_csv(StringIO( expected2.to_csv(index=False)), header=[0, 1]) tm.assert_frame_equal(round_trip, expected2) - def test_no_unnamed_index(self): data = """ id c0 c1 c2 0 1 0 a b From 32e3b0a01930c48a51d1a7c209dee9724fd5ac19 Mon Sep 17 00:00:00 2001 From: Ben Kandel Date: Tue, 15 Nov 2016 13:19:38 -0500 Subject: [PATCH 12/12] lint --- pandas/parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 0b1c9eba63ba7..6b43dfbabc4a0 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -719,7 +719,7 @@ cdef class TextReader: # e.g., if header=3 and file only has 2 lines elif (self.parser.lines < hr + 1 and not isinstance(self.orig_header, list)) or ( - self.parser.lines < hr): + self.parser.lines < hr): msg = self.orig_header if isinstance(msg, list): msg = "[%s], len of %d," % (