Merge pull request #8834 from jreback/parser

jreback · jreback · commit 5e8ba36208cc · 2014-11-17T07:54:59.000-05:00
BUG: Bug in csv parsing when passing dtype and names and the parsed data is a different data type (GH8833)
diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
@@ -64,6 +64,7 @@ Bug Fixes
 - Bug in packaging pandas with ``py2app/cx_Freeze`` (:issue:`8602`, :issue:`8831`)
 - Bug in ``groupby`` signatures that didn't include \*args or \*\*kwargs (:issue:`8733`).
 - ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo and when it receives no data from Yahoo (:issue:`8761`), (:issue:`8783`).
+- Unclear error message in csv parsing when passing ``dtype`` and ``names`` and the parsed data is of a different data type (:issue:`8833`)
 - Bug in slicing a multi-index with an empty list and at least one boolean indexer (:issue:`8781`)
 - ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo (:issue:`8761`).
 - ``Timedelta`` kwargs may now be numpy ints and floats (:issue:`8757`).
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -485,7 +485,6 @@ def test_index_col_named(self):
 
         h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
         data = h + no_header
-        # import pdb; pdb.set_trace()
         rs = self.read_csv(StringIO(data), index_col='ID')
         xp = self.read_csv(StringIO(data), header=0).set_index('ID')
         tm.assert_frame_equal(rs, xp)
@@ -2864,8 +2863,8 @@ def test_empty_lines(self):
     def test_whitespace_lines(self):
         data = """
 
-\t  \t\t 
-  \t  
+\t  \t\t
+  \t
 A,B,C
   \t    1,2.,4.
 5.,NaN,10.0
@@ -3110,8 +3109,8 @@ def test_empty_lines(self):
     def test_whitespace_lines(self):
         data = """
 
-\t  \t\t 
-  \t  
+\t  \t\t
+  \t
 A,B,C
   \t    1,2.,4.
 5.,NaN,10.0
@@ -3154,6 +3153,39 @@ def test_passing_dtype(self):
             self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' },
                               index_col=0)
 
+    def test_dtype_and_names_error(self):
+
+        # GH 8833
+        # passing both dtype and names produced an unclear error message
+
+        data = """
+1.0 1
+2.0 2
+3.0 3
+"""
+        # base cases
+        result = self.read_csv(StringIO(data),sep='\s+',header=None)
+        expected = DataFrame([[1.0,1],[2.0,2],[3.0,3]])
+        tm.assert_frame_equal(result, expected)
+
+        result = self.read_csv(StringIO(data),sep='\s+',header=None,names=['a','b'])
+        expected = DataFrame([[1.0,1],[2.0,2],[3.0,3]],columns=['a','b'])
+        tm.assert_frame_equal(result, expected)
+
+        # fallback casting
+        result = self.read_csv(StringIO(data),sep='\s+',header=None,names=['a','b'],dtype={'a' : int})
+        expected = DataFrame([[1,1],[2,2],[3,3]],columns=['a','b'])
+        tm.assert_frame_equal(result, expected)
+
+        data = """
+1.0 1
+nan 2
+3.0 3
+"""
+        # fallback casting, but not castable
+        with tm.assertRaisesRegexp(ValueError, 'cannot safely convert'):
+            self.read_csv(StringIO(data),sep='\s+',header=None,names=['a','b'],dtype={'a' : int})
+
     def test_fallback_to_python(self):
         # GH 6607
         data = 'a b c\n1 2 3'
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -1002,8 +1002,12 @@ cdef class TextReader:
                     else:
                         col_dtype = np.dtype(col_dtype).str
 
-                return self._convert_with_dtype(col_dtype, i, start, end,
-                                                na_filter, 1, na_hashset, na_flist)
+                col_res, na_count = self._convert_with_dtype(col_dtype, i, start, end,
+                                                             na_filter, 1, na_hashset, na_flist)
+
+                # fallback on the parse (e.g. we requested int dtype, but it's actually a float)
+                if col_res is not None:
+                    return col_res, na_count
 
         if i in self.noconvert:
             return self._string_convert(i, start, end, na_filter, na_hashset)
@@ -1020,6 +1024,25 @@ cdef class TextReader:
                 if col_res is not None:
                     break
 
+        # we had a fallback parse on the dtype, so now try to cast
+        # only allow safe casts, e.g. a column containing nan cannot be safely cast to int
+        if col_res is not None and col_dtype is not None:
+            try:
+                col_res = col_res.astype(col_dtype,casting='safe')
+            except TypeError:
+
+                # float -> int conversions can fail the above
+                # even with no nans
+                col_res_orig = col_res
+                col_res = col_res.astype(col_dtype)
+                if (col_res != col_res_orig).any():
+                    raise ValueError("cannot safely convert passed user dtype of "
+                                     "{col_dtype} for {col_res} dtyped data in "
+                                     "column {column}".format(col_dtype=col_dtype,
+                                                              col_res=col_res_orig.dtype.name,
+                                                              column=i))
+
+
         return col_res, na_count
 
     cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
@@ -1033,8 +1056,9 @@ cdef class TextReader:
         if dtype[1] == 'i' or dtype[1] == 'u':
             result, na_count = _try_int64(self.parser, i, start, end,
                                           na_filter, na_hashset)
-            if user_dtype and na_count > 0:
-                raise Exception('Integer column has NA values')
+            if user_dtype and na_count is not None:
+                if na_count > 0:
+                    raise Exception('Integer column has NA values')
 
             if dtype[1:] != 'i8':
                 result = result.astype(dtype)