diff --git a/doc/source/release.rst b/doc/source/release.rst index de9743bdc705a..c147a83032761 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -183,6 +183,7 @@ Improvements to existing features - ``Series`` now supports a ``to_frame`` method to convert it to a single-column DataFrame (:issue:`5164`) - DatetimeIndex (and date_range) can now be constructed in a left- or right-open fashion using the ``closed`` parameter (:issue:`4579`) + - Python csv parser now supports usecols (:issue:`4335`) API Changes ~~~~~~~~~~~ @@ -625,6 +626,8 @@ Bug Fixes - Fixed bug in Excel writers where frames with duplicate column names weren't written correctly. (:issue:`5235`) - Fixed issue with ``drop`` and a non-unique index on Series (:issue:`5248`) + - Fixed seg fault in C parser caused by passing more names than columns in + the file. (:issue:`5156`) pandas 0.12.0 ------------- diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index b81feec6ab6f8..99a6c630e6ac4 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1955,6 +1955,15 @@ def test_integer_overflow_bug(self): result = self.read_csv(StringIO(data), header=None, sep='\s+') self.assertTrue(result[0].dtype == np.float64) + def test_catch_too_many_names(self): + # Issue 5156 + data = """\ +1,2,3 +4,,6 +7,8,9 +10,11,12\n""" + tm.assertRaises(Exception, read_csv, StringIO(data), header=0, names=['a', 'b', 'c', 'd']) + class TestPythonParser(ParserTests, unittest.TestCase): def test_negative_skipfooter_raises(self): diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 8625038c57b23..06a1ddfdae025 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -801,7 +801,6 @@ cdef class TextReader: raise StopIteration self._end_clock('Tokenization') - self._start_clock() columns = self._convert_column_data(rows=rows, footer=footer, @@ -840,11 +839,12 @@ cdef class TextReader: def _convert_column_data(self, rows=None, upcast_na=False, footer=0): cdef: - Py_ssize_t i, nused, ncols + Py_ssize_t i, nused kh_str_t *na_hashset = NULL int start, end object name, na_flist bint na_filter = 0 + Py_ssize_t num_cols start = self.parser_start @@ -857,6 +857,22 @@ cdef class TextReader: # if footer > 0: # end -= footer + #print >> sys.stderr, self.table_width + #print >> sys.stderr, self.leading_cols + #print >> sys.stderr, self.parser.lines + #print >> sys.stderr, start + #print >> sys.stderr, end + #print >> sys.stderr, self.header + #print >> sys.stderr, "index" + num_cols = -1 + for i in range(self.parser.lines): + num_cols = (num_cols < self.parser.line_fields[i]) * self.parser.line_fields[i] +\ + (num_cols >= self.parser.line_fields[i]) * num_cols + + if self.table_width - self.leading_cols > num_cols: + raise CParserError("Too many columns specified: expected %s and found %s" % + (self.table_width - self.leading_cols, num_cols)) + results = {} nused = 0 for i in range(self.table_width): @@ -1446,7 +1462,6 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): word = COLITER_NEXT(it) - k = kh_get_str(na_hashset, word) # in the hash table if k != na_hashset.n_buckets: @@ -1828,16 +1843,6 @@ cdef _apply_converter(object f, parser_t *parser, int col, return lib.maybe_convert_objects(result) - # if issubclass(values.dtype.type, (np.number, np.bool_)): - # return values - - # # XXX - # na_values = set(['']) - # try: - # return lib.maybe_convert_numeric(values, na_values, False) - # except Exception: - # na_count = lib.sanitize_objects(values, na_values, False) - # return result def _to_structured_array(dict columns, object names): cdef: diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 45b8b9263e9cd..da991ec23c373 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -709,7 +709,6 @@ int tokenize_delimited(parser_t *self, size_t line_limit) if (c == '\n') { END_FIELD(); END_LINE(); - /* self->state = START_RECORD; */ } else if (c == '\r') { END_FIELD(); self->state = EAT_CRNL; diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 69f627dda554c..4e40d892a8b4a 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -161,7 +161,7 @@ typedef struct parser_t { int *line_start; // position in words for start of line int *line_fields; // Number of fields in each line - int lines; // Number of (good) lines observedb + int lines; // Number of (good) lines observed int file_lines; // Number of file lines observed (including bad or skipped) int lines_cap; // Vector capacity