Skip to content

Commit 5e8ba36

Browse files
committed
Merge pull request #8834 from jreback/parser
BUG: Bug in csv parsing when passing dtype and names and the parsed data is a different data type (GH8833)
2 parents 9dd675b + 943ca30 commit 5e8ba36

File tree

3 files changed

+66
-9
lines changed

3 files changed

+66
-9
lines changed

doc/source/whatsnew/v0.15.2.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ Bug Fixes
6464
- Bug in packaging pandas with ``py2app/cx_Freeze`` (:issue:`8602`, :issue:`8831`)
6565
- Bug in ``groupby`` signatures that didn't include \*args or \*\*kwargs (:issue:`8733`).
6666
- ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo and when it receives no data from Yahoo (:issue:`8761`), (:issue:`8783`).
67+
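- Bug in csv parsing when passing ``dtype`` and ``names`` where the parsed data is of a different data type: the data is now cast to the requested dtype when this can be done safely, otherwise a ``ValueError`` with a clear message is raised (:issue:`8833`)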
6768
- Bug in slicing a multi-index with an empty list and at least one boolean indexer (:issue:`8781`)
6869
- ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo (:issue:`8761`).
6970
- ``Timedelta`` kwargs may now be numpy ints and floats (:issue:`8757`).

pandas/io/tests/test_parsers.py

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,6 @@ def test_index_col_named(self):
485485

486486
h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
487487
data = h + no_header
488-
# import pdb; pdb.set_trace()
489488
rs = self.read_csv(StringIO(data), index_col='ID')
490489
xp = self.read_csv(StringIO(data), header=0).set_index('ID')
491490
tm.assert_frame_equal(rs, xp)
@@ -2864,8 +2863,8 @@ def test_empty_lines(self):
28642863
def test_whitespace_lines(self):
28652864
data = """
28662865
2867-
\t \t\t
2868-
\t
2866+
\t \t\t
2867+
\t
28692868
A,B,C
28702869
\t 1,2.,4.
28712870
5.,NaN,10.0
@@ -3110,8 +3109,8 @@ def test_empty_lines(self):
31103109
def test_whitespace_lines(self):
31113110
data = """
31123111
3113-
\t \t\t
3114-
\t
3112+
\t \t\t
3113+
\t
31153114
A,B,C
31163115
\t 1,2.,4.
31173116
5.,NaN,10.0
@@ -3154,6 +3153,39 @@ def test_passing_dtype(self):
31543153
self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' },
31553154
index_col=0)
31563155

3156+
def test_dtype_and_names_error(self):
3157+
3158+
# GH 8833
3159+
# passing both dtype and names previously resulted in an unclear error message
3160+
3161+
data = """
3162+
1.0 1
3163+
2.0 2
3164+
3.0 3
3165+
"""
3166+
# base cases
3167+
result = self.read_csv(StringIO(data),sep='\s+',header=None)
3168+
expected = DataFrame([[1.0,1],[2.0,2],[3.0,3]])
3169+
tm.assert_frame_equal(result, expected)
3170+
3171+
result = self.read_csv(StringIO(data),sep='\s+',header=None,names=['a','b'])
3172+
expected = DataFrame([[1.0,1],[2.0,2],[3.0,3]],columns=['a','b'])
3173+
tm.assert_frame_equal(result, expected)
3174+
3175+
# fallback casting
3176+
result = self.read_csv(StringIO(data),sep='\s+',header=None,names=['a','b'],dtype={'a' : int})
3177+
expected = DataFrame([[1,1],[2,2],[3,3]],columns=['a','b'])
3178+
tm.assert_frame_equal(result, expected)
3179+
3180+
data = """
3181+
1.0 1
3182+
nan 2
3183+
3.0 3
3184+
"""
3185+
# fallback casting, but not castable
3186+
with tm.assertRaisesRegexp(ValueError, 'cannot safely convert'):
3187+
self.read_csv(StringIO(data),sep='\s+',header=None,names=['a','b'],dtype={'a' : int})
3188+
31573189
def test_fallback_to_python(self):
31583190
# GH 6607
31593191
data = 'a b c\n1 2 3'

pandas/parser.pyx

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1002,8 +1002,12 @@ cdef class TextReader:
10021002
else:
10031003
col_dtype = np.dtype(col_dtype).str
10041004

1005-
return self._convert_with_dtype(col_dtype, i, start, end,
1006-
na_filter, 1, na_hashset, na_flist)
1005+
col_res, na_count = self._convert_with_dtype(col_dtype, i, start, end,
1006+
na_filter, 1, na_hashset, na_flist)
1007+
1008+
# fall back on the generic parse (e.g. we requested an int dtype, but the data is actually float)
1009+
if col_res is not None:
1010+
return col_res, na_count
10071011

10081012
if i in self.noconvert:
10091013
return self._string_convert(i, start, end, na_filter, na_hashset)
@@ -1020,6 +1024,25 @@ cdef class TextReader:
10201024
if col_res is not None:
10211025
break
10221026

1027+
# we had a fallback parse on the dtype, so now try to cast
1028+
# only allow safe casts, e.g. with a nan you cannot safely cast to int
1029+
if col_res is not None and col_dtype is not None:
1030+
try:
1031+
col_res = col_res.astype(col_dtype,casting='safe')
1032+
except TypeError:
1033+
1034+
# float -> int conversions can fail the above
1035+
# even with no nans
1036+
col_res_orig = col_res
1037+
col_res = col_res.astype(col_dtype)
1038+
if (col_res != col_res_orig).any():
1039+
raise ValueError("cannot safely convert passed user dtype of "
1040+
"{col_dtype} for {col_res} dtyped data in "
1041+
"column {column}".format(col_dtype=col_dtype,
1042+
col_res=col_res_orig.dtype.name,
1043+
column=i))
1044+
1045+
10231046
return col_res, na_count
10241047

10251048
cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
@@ -1033,8 +1056,9 @@ cdef class TextReader:
10331056
if dtype[1] == 'i' or dtype[1] == 'u':
10341057
result, na_count = _try_int64(self.parser, i, start, end,
10351058
na_filter, na_hashset)
1036-
if user_dtype and na_count > 0:
1037-
raise Exception('Integer column has NA values')
1059+
if user_dtype and na_count is not None:
1060+
if na_count > 0:
1061+
raise Exception('Integer column has NA values')
10381062

10391063
if dtype[1:] != 'i8':
10401064
result = result.astype(dtype)

0 commit comments

Comments
 (0)