Skip to content

BUG: Segfault due to float_precision='round_trip' #15148

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -437,3 +437,5 @@ Bug Fixes

- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
- Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`)

- Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`)
7 changes: 7 additions & 0 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,3 +388,10 @@ def test_read_nrows_large(self):
df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010)

self.assertTrue(df.size == 1010 * 10)

def test_float_precision_round_trip_with_text(self):
# gh-15140 - This should not segfault on Python 2.7+
df = self.read_csv(StringIO('a'),
float_precision='round_trip',
header=None)
tm.assert_frame_equal(df, DataFrame({0: ['a']}))
43 changes: 32 additions & 11 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,11 @@ cdef extern from "parser/tokenizer.h":
PyObject *skipfunc
int64_t skip_first_N_rows
int skipfooter
double (*converter)(const char *, char **, char, char, char, int) nogil
# pick one, depending on whether the converter requires GIL
double (*double_converter_nogil)(const char *, char **,
char, char, char, int) nogil
double (*double_converter_withgil)(const char *, char **,
char, char, char, int)

# error handling
char *warn_msg
Expand Down Expand Up @@ -482,11 +486,14 @@ cdef class TextReader:

self.verbose = verbose
self.low_memory = low_memory
self.parser.converter = xstrtod
self.parser.double_converter_nogil = xstrtod
self.parser.double_converter_withgil = NULL
if float_precision == 'high':
self.parser.converter = precise_xstrtod
elif float_precision == 'round_trip':
self.parser.converter = round_trip
self.parser.double_converter_nogil = precise_xstrtod
self.parser.double_converter_withgil = NULL
elif float_precision == 'round_trip': # avoid gh-15140
self.parser.double_converter_nogil = NULL
self.parser.double_converter_withgil = round_trip

# encoding
if encoding is not None:
Expand Down Expand Up @@ -1699,17 +1706,31 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
result = np.empty(lines, dtype=np.float64)
data = <double *> result.data
na_fset = kset_float64_from_list(na_flist)
with nogil:
error = _try_double_nogil(parser, col, line_start, line_end,
if parser.double_converter_nogil != NULL: # if it can run without the GIL
with nogil:
error = _try_double_nogil(parser, parser.double_converter_nogil,
col, line_start, line_end,
na_filter, na_hashset, use_na_flist,
na_fset, NA, data, &na_count)
else:
assert parser.double_converter_withgil != NULL
error = _try_double_nogil(parser, <double (*)(
const char *, char **,
char, char, char, int)
nogil>parser.double_converter_withgil,
col, line_start, line_end,
na_filter, na_hashset, use_na_flist,
na_fset, NA, data, &na_count)
kh_destroy_float64(na_fset)
if error != 0:
return None, None
return result, na_count

cdef inline int _try_double_nogil(parser_t *parser, int col,
int line_start, int line_end,
cdef inline int _try_double_nogil(parser_t *parser,
double (*double_converter)(
const char *, char **, char,
char, char, int) nogil,
int col, int line_start, int line_end,
bint na_filter, kh_str_t *na_hashset,
bint use_na_flist,
const kh_float64_t *na_flist,
Expand Down Expand Up @@ -1739,7 +1760,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col,
na_count[0] += 1
data[0] = NA
else:
data[0] = parser.converter(word, &p_end, parser.decimal,
data[0] = double_converter(word, &p_end, parser.decimal,
parser.sci, parser.thousands, 1)
if errno != 0 or p_end[0] or p_end == word:
if (strcasecmp(word, cinf) == 0 or
Expand All @@ -1760,7 +1781,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col,
else:
for i in range(lines):
COLITER_NEXT(it, word)
data[0] = parser.converter(word, &p_end, parser.decimal,
data[0] = double_converter(word, &p_end, parser.decimal,
parser.sci, parser.thousands, 1)
if errno != 0 or p_end[0] or p_end == word:
if (strcasecmp(word, cinf) == 0 or
Expand Down
8 changes: 3 additions & 5 deletions pandas/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1773,11 +1773,9 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,

double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
int skip_trailing) {
#if PY_VERSION_HEX >= 0x02070000
return PyOS_string_to_double(p, q, 0);
#else
return strtod(p, q);
#endif
double r = PyOS_string_to_double(p, q, 0);
PyErr_Clear();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in fact the entire ifdef can be removed here, we only support >= 2.7

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does the test cover the error condition here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does. An exception would otherwise occur in test_float_precision_round_trip_with_text.

return r;
}

// End of xstrtod code
Expand Down
6 changes: 5 additions & 1 deletion pandas/src/parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,11 @@ typedef struct parser_t {
PyObject *skipfunc;
int64_t skip_first_N_rows;
int skip_footer;
double (*converter)(const char *, char **, char, char, char, int);
// pick one, depending on whether the converter requires GIL
double (*double_converter_nogil)(const char *, char **,
char, char, char, int);
double (*double_converter_withgil)(const char *, char **,
char, char, char, int);

// error handling
char *warn_msg;
Expand Down