diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 3f72d5d44f870..b51da47563b1b 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -64,6 +64,7 @@ Bug Fixes - Bug in packaging pandas with ``py2app/cx_Freeze`` (:issue:`8602`, :issue:`8831`) - Bug in ``groupby`` signatures that didn't include \*args or \*\*kwargs (:issue:`8733`). - ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo and when it receives no data from Yahoo (:issue:`8761`), (:issue:`8783`). +- Bug in csv parsing when passing ``dtype`` and ``names`` where an unclear error message was raised if the parsed data is of a different data type (:issue:`8833`) - Bug in slicing a multi-index with an empty list and at least one boolean indexer (:issue:`8781`) - ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo (:issue:`8761`). - ``Timedelta`` kwargs may now be numpy ints and floats (:issue:`8757`). diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 8440d45ffb4be..7b8bdeb1f38b8 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -485,7 +485,6 @@ def test_index_col_named(self): h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" data = h + no_header - # import pdb; pdb.set_trace() rs = self.read_csv(StringIO(data), index_col='ID') xp = self.read_csv(StringIO(data), header=0).set_index('ID') tm.assert_frame_equal(rs, xp) @@ -2864,8 +2863,8 @@ def test_empty_lines(self): def test_whitespace_lines(self): data = """ -\t \t\t - \t +\t \t\t + \t A,B,C \t 1,2.,4. 
5.,NaN,10.0 @@ -3154,6 +3153,39 @@ def test_passing_dtype(self): self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' }, index_col=0) + def test_dtype_and_names_error(self): + + # GH 8833 + # passing both dtype and names resulting in an error reporting issue + + data = """ +1.0 1 +2.0 2 +3.0 3 +""" + # base cases: no dtype requested, column types are inferred + result = self.read_csv(StringIO(data),sep=r'\s+',header=None) + expected = DataFrame([[1.0,1],[2.0,2],[3.0,3]]) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data),sep=r'\s+',header=None,names=['a','b']) + expected = DataFrame([[1.0,1],[2.0,2],[3.0,3]],columns=['a','b']) + tm.assert_frame_equal(result, expected) + + # fallback casting: integral-valued floats are cast to the requested int dtype + result = self.read_csv(StringIO(data),sep=r'\s+',header=None,names=['a','b'],dtype={'a' : int}) + expected = DataFrame([[1,1],[2,2],[3,3]],columns=['a','b']) + tm.assert_frame_equal(result, expected) + + data = """ +1.0 1 +nan 2 +3.0 3 +""" + # fallback casting, but not castable: a nan cannot be safely converted to int + with tm.assertRaisesRegexp(ValueError, 'cannot safely convert'): + self.read_csv(StringIO(data),sep=r'\s+',header=None,names=['a','b'],dtype={'a' : int}) + def test_fallback_to_python(self): # GH 6607 data = 'a b c\n1 2 3' diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 5f56bd312b9a3..eb80a51728765 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -1002,8 +1002,12 @@ cdef class TextReader: else: col_dtype = np.dtype(col_dtype).str - return self._convert_with_dtype(col_dtype, i, start, end, - na_filter, 1, na_hashset, na_flist) + col_res, na_count = self._convert_with_dtype(col_dtype, i, start, end, + na_filter, 1, na_hashset, na_flist) + + # fallback on the parse (e.g. 
we requested int dtype, but it's actually a float) + if col_res is not None: + return col_res, na_count if i in self.noconvert: return self._string_convert(i, start, end, na_filter, na_hashset) @@ -1020,6 +1024,25 @@ cdef class TextReader: if col_res is not None: break + # we had a fallback parse on the dtype, so now try to cast + # only allow safe casts, e.g. with a nan you cannot safely cast to int + if col_res is not None and col_dtype is not None: + try: + col_res = col_res.astype(col_dtype,casting='safe') + except TypeError: + + # float -> int conversions can fail the above + # even with no nans + col_res_orig = col_res + col_res = col_res.astype(col_dtype) + if (col_res != col_res_orig).any(): + raise ValueError("cannot safely convert passed user dtype of " + "{col_dtype} for {col_res} dtyped data in " + "column {column}".format(col_dtype=col_dtype, + col_res=col_res_orig.dtype.name, + column=i)) + + return col_res, na_count cdef _convert_with_dtype(self, object dtype, Py_ssize_t i, @@ -1033,8 +1056,9 @@ cdef class TextReader: if dtype[1] == 'i' or dtype[1] == 'u': result, na_count = _try_int64(self.parser, i, start, end, na_filter, na_hashset) - if user_dtype and na_count > 0: - raise Exception('Integer column has NA values') + if user_dtype and na_count is not None: + if na_count > 0: + raise Exception('Integer column has NA values') if dtype[1:] != 'i8': result = result.astype(dtype)