diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index f1c5b0c854055..a3eb19d92d48a 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -77,6 +77,8 @@ Bug Fixes - Bug in `plot` not defaulting to matplotlib `axes.grid` setting (:issue:`9792`) +- Bug causing strings containing an exponent but no decimal to be parsed as ints instead of floats by the Python CSV parser (:issue:`9565`) + - Bug in ``Series.align`` resets ``name`` when ``fill_value`` is specified (:issue:`10067`) - Bug in ``SparseSeries.abs`` resets ``name`` (:issue:`10241`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 7d52c6ad4cb3b..88bf3bc90324f 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -35,7 +35,7 @@ from numpy.testing.decorators import slow from numpy.testing import assert_array_equal -from pandas.parser import OverflowError, CParserError +import pandas.parser class ParserTests(object): @@ -1648,7 +1648,7 @@ def test_read_table_buglet_4x_multiindex(self): # Temporarily copied to TestPythonParser. 
# Here test that CParserError is raised: - with tm.assertRaises(CParserError): + with tm.assertRaises(pandas.parser.CParserError): text = """ A B C D E one two three four a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 @@ -2293,6 +2293,46 @@ def test_chunk_begins_with_newline_whitespace(self): result = self.read_csv(StringIO(data), header=None) self.assertEqual(len(result), 2) + def test_float_parser(self): + # GH 9565 + data = '45e-1,4.5,45.,inf,-inf' + result = self.read_csv(StringIO(data), header=None) + expected = pd.DataFrame([[float(s) for s in data.split(',')]]) + tm.assert_frame_equal(result, expected) + + def test_int64_overflow(self): + data = """ID +00013007854817840016671868 +00013007854817840016749251 +00013007854817840016754630 +00013007854817840016781876 +00013007854817840017028824 +00013007854817840017963235 +00013007854817840018860166""" + + result = self.read_csv(StringIO(data)) + self.assertTrue(result['ID'].dtype == object) + + self.assertRaises((OverflowError, pandas.parser.OverflowError), + self.read_csv, StringIO(data), + converters={'ID' : np.int64}) + + # Just inside int64 range: parse as integer + i_max = np.iinfo(np.int64).max + i_min = np.iinfo(np.int64).min + for x in [i_max, i_min]: + result = pd.read_csv(StringIO(str(x)), header=None) + expected = pd.DataFrame([x]) + tm.assert_frame_equal(result, expected) + + # Just outside int64 range: parse as string + too_big = i_max + 1 + too_small = i_min - 1 + for x in [too_big, too_small]: + result = pd.read_csv(StringIO(str(x)), header=None) + expected = pd.DataFrame([str(x)]) + tm.assert_frame_equal(result, expected) + class TestPythonParser(ParserTests, tm.TestCase): def test_negative_skipfooter_raises(self): @@ -3567,22 +3607,6 @@ def test_disable_bool_parsing(self): result = read_csv(StringIO(data), dtype=object, na_filter=False) self.assertEqual(result['B'][2], '') - def test_int64_overflow(self): - data = """ID -00013007854817840016671868 -00013007854817840016749251 
-00013007854817840016754630 -00013007854817840016781876 -00013007854817840017028824 -00013007854817840017963235 -00013007854817840018860166""" - - result = read_csv(StringIO(data)) - self.assertTrue(result['ID'].dtype == object) - - self.assertRaises(OverflowError, read_csv, StringIO(data), - dtype='i8') - def test_euro_decimal_format(self): data = """Id;Number1;Number2;Text1;Text2;Number3 1;1521,1541;187101,9543;ABC;poi;4,738797819 diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 55d5e37fc19ee..9ee5a753af567 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -514,11 +514,10 @@ def is_period_array(ndarray[object] values): cdef extern from "parse_helper.h": - inline int floatify(object, double *result) except -1 - -cdef double fINT64_MAX = INT64_MAX -cdef double fINT64_MIN = INT64_MIN + inline int floatify(object, double *result, int *maybe_int) except -1 +cdef int64_t iINT64_MAX = INT64_MAX +cdef int64_t iINT64_MIN = INT64_MIN def maybe_convert_numeric(object[:] values, set na_values, bint convert_empty=True, bint coerce_numeric=False): @@ -527,7 +526,7 @@ def maybe_convert_numeric(object[:] values, set na_values, convert to proper dtype array ''' cdef: - int status + int status, maybe_int Py_ssize_t i, n = values.size ndarray[float64_t] floats = np.empty(n, dtype='f8') ndarray[complex128_t] complexes = np.empty(n, dtype='c16') @@ -569,18 +568,16 @@ def maybe_convert_numeric(object[:] values, set na_values, seen_complex = True else: try: - status = floatify(val, &fval) + status = floatify(val, &fval, &maybe_int) floats[i] = fval if not seen_float: - if '.' 
in val or fval == INF or fval == NEGINF: - seen_float = True - elif 'inf' in val: # special case to handle +/-inf - seen_float = True - elif fval < fINT64_MAX and fval > fINT64_MIN: - try: - ints[i] = int(val) - except ValueError: - ints[i] = fval + if maybe_int: + as_int = int(val) + + if as_int <= iINT64_MAX and as_int >= iINT64_MIN: + ints[i] = as_int + else: + raise ValueError('integer out of range') else: seen_float = True except: diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h index 763cbc03a9cbe..2769f67fcf521 100644 --- a/pandas/src/parse_helper.h +++ b/pandas/src/parse_helper.h @@ -2,13 +2,13 @@ #include static double xstrtod(const char *p, char **q, char decimal, char sci, - int skip_trailing); + int skip_trailing, int *maybe_int); -int to_double(char *item, double *p_value, char sci, char decimal) +int to_double(char *item, double *p_value, char sci, char decimal, int *maybe_int) { char *p_end; - *p_value = xstrtod(item, &p_end, decimal, sci, 1); + *p_value = xstrtod(item, &p_end, decimal, sci, 1, maybe_int); return (errno == 0) && (!*p_end); } @@ -18,7 +18,7 @@ int to_double(char *item, double *p_value, char sci, char decimal) #define PyBytes_AS_STRING PyString_AS_STRING #endif -int floatify(PyObject* str, double *result) { +int floatify(PyObject* str, double *result, int *maybe_int) { int status; char *data; PyObject* tmp = NULL; @@ -35,14 +35,16 @@ int floatify(PyObject* str, double *result) { return -1; } - status = to_double(data, result, sci, dec); + status = to_double(data, result, sci, dec, maybe_int); if (!status) { /* handle inf/-inf */ if (0 == strcmp(data, "-inf")) { *result = -HUGE_VAL; + *maybe_int = 0; } else if (0 == strcmp(data, "inf")) { *result = HUGE_VAL; + *maybe_int = 0; } else { PyErr_SetString(PyExc_ValueError, "Unable to parse string"); Py_XDECREF(tmp); @@ -117,7 +119,7 @@ PANDAS_INLINE void uppercase(char *p) { static double xstrtod(const char *str, char **endptr, char decimal, - char sci, int skip_trailing) + 
char sci, int skip_trailing, int *maybe_int) { double number; int exponent; @@ -129,6 +131,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, int num_decimals; errno = 0; + *maybe_int = 1; // Skip leading whitespace while (isspace(*p)) p++; @@ -157,6 +160,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, // Process decimal part if (*p == decimal) { + *maybe_int = 0; p++; while (isdigit(*p)) @@ -182,6 +186,8 @@ static double xstrtod(const char *str, char **endptr, char decimal, // Process an exponent string if (toupper(*p) == toupper(sci)) { + *maybe_int = 0; + // Handle optional sign negative = 0; switch (*++p)