
BUG: Fixed issue #5156: segfault on read_csv #5268


Merged: 1 commit, Oct 19, 2013
3 changes: 3 additions & 0 deletions doc/source/release.rst
@@ -183,6 +183,7 @@ Improvements to existing features
- ``Series`` now supports a ``to_frame`` method to convert it to a single-column DataFrame (:issue:`5164`)
- DatetimeIndex (and date_range) can now be constructed in a left- or
right-open fashion using the ``closed`` parameter (:issue:`4579`)
- Python csv parser now supports usecols (:issue:`4335`)
Contributor Author

This was requested here: #5211
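
A minimal sketch of what this release note enables, assuming the read_csv signature of this era (the engine keyword selecting the Python parser); the data and column names are illustrative:

    from io import StringIO
    import pandas as pd

    data = "a,b,c\n1,2,3\n4,5,6\n"

    # usecols is now honored by the Python engine, matching the C engine.
    df = pd.read_csv(StringIO(data), engine='python', usecols=['a', 'c'])
    print(df.columns.tolist())  # ['a', 'c']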


API Changes
~~~~~~~~~~~
@@ -625,6 +626,8 @@ Bug Fixes
- Fixed bug in Excel writers where frames with duplicate column names weren't
written correctly. (:issue:`5235`)
- Fixed issue with ``drop`` and a non-unique index on Series (:issue:`5248`)
- Fixed seg fault in C parser caused by passing more names than columns in
the file. (:issue:`5156`)

pandas 0.12.0
-------------
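
As a quick illustration of the fix described above, a hedged reproduction sketch built from the test added below: four names for a three-column file used to crash the C parser outright and should now surface as an exception (CParserError in this era; caught generically here):

    from io import StringIO
    import pandas as pd

    data = "1,2,3\n4,,6\n7,8,9\n10,11,12\n"

    # More names than columns in the file: formerly a segfault, now an error.
    try:
        pd.read_csv(StringIO(data), header=0, names=['a', 'b', 'c', 'd'])
    except Exception as exc:
        print(type(exc).__name__, exc)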
9 changes: 9 additions & 0 deletions pandas/io/tests/test_parsers.py
@@ -1955,6 +1955,15 @@ def test_integer_overflow_bug(self):
result = self.read_csv(StringIO(data), header=None, sep='\s+')
self.assertTrue(result[0].dtype == np.float64)

def test_catch_too_many_names(self):
# Issue 5156
data = """\
1,2,3
4,,6
7,8,9
10,11,12\n"""
tm.assertRaises(Exception, read_csv, StringIO(data), header=0, names=['a', 'b', 'c', 'd'])


class TestPythonParser(ParserTests, unittest.TestCase):
def test_negative_skipfooter_raises(self):
31 changes: 18 additions & 13 deletions pandas/parser.pyx
@@ -801,7 +801,6 @@ cdef class TextReader:
raise StopIteration
self._end_clock('Tokenization')


self._start_clock()
columns = self._convert_column_data(rows=rows,
footer=footer,
@@ -840,11 +839,12 @@

def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
cdef:
Py_ssize_t i, nused, ncols
Py_ssize_t i, nused
kh_str_t *na_hashset = NULL
int start, end
object name, na_flist
bint na_filter = 0
Py_ssize_t num_cols

start = self.parser_start

@@ -857,6 +857,22 @@
# if footer > 0:
# end -= footer

#print >> sys.stderr, self.table_width
#print >> sys.stderr, self.leading_cols
#print >> sys.stderr, self.parser.lines
#print >> sys.stderr, start
#print >> sys.stderr, end
#print >> sys.stderr, self.header
#print >> sys.stderr, "index"
num_cols = -1
for i in range(self.parser.lines):
num_cols = (num_cols < self.parser.line_fields[i]) * self.parser.line_fields[i] +\
(num_cols >= self.parser.line_fields[i]) * num_cols

if self.table_width - self.leading_cols > num_cols:
raise CParserError("Too many columns specified: expected %s and found %s" %
(self.table_width - self.leading_cols, num_cols))

results = {}
nused = 0
for i in range(self.table_width):
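
The loop added in this hunk is a branchless maximum: for each line, exactly one of the two boolean factors is 1, so num_cols becomes either that line's field count or the running maximum. A pure-Python sketch of the same computation (line_fields stands in for the parser's per-line field counts):

    def max_fields(line_fields):
        # Branchless max, mirroring the Cython loop above.
        num_cols = -1
        for n in line_fields:
            num_cols = (num_cols < n) * n + (num_cols >= n) * num_cols
        return num_cols

    # 3 columns found; asking for 4 names now triggers CParserError.
    print(max_fields([3, 3, 3, 3]))  # 3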
@@ -1446,7 +1462,6 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
if na_filter:
for i in range(lines):
word = COLITER_NEXT(it)

k = kh_get_str(na_hashset, word)
# in the hash table
if k != na_hashset.n_buckets:
@@ -1828,16 +1843,6 @@ cdef _apply_converter(object f, parser_t *parser, int col,

return lib.maybe_convert_objects(result)

# if issubclass(values.dtype.type, (np.number, np.bool_)):
# return values

# # XXX
# na_values = set([''])
# try:
# return lib.maybe_convert_numeric(values, na_values, False)
# except Exception:
# na_count = lib.sanitize_objects(values, na_values, False)
# return result

def _to_structured_array(dict columns, object names):
cdef:
1 change: 0 additions & 1 deletion pandas/src/parser/tokenizer.c
@@ -709,7 +709,6 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
if (c == '\n') {
END_FIELD();
END_LINE();
/* self->state = START_RECORD; */
} else if (c == '\r') {
END_FIELD();
self->state = EAT_CRNL;
2 changes: 1 addition & 1 deletion pandas/src/parser/tokenizer.h
@@ -161,7 +161,7 @@ typedef struct parser_t {

int *line_start; // position in words for start of line
int *line_fields; // Number of fields in each line
int lines; // Number of (good) lines observedb
int lines; // Number of (good) lines observed
int file_lines; // Number of file lines observed (including bad or skipped)
int lines_cap; // Vector capacity
