Skip to content

BUG: Bug in csv parsing when passing dtype and names and the parsed data is a different data type (GH8833) #8834

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 17, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.15.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ Bug Fixes
- Bug in packaging pandas with ``py2app/cx_Freeze`` (:issue:`8602`, :issue:`8831`)
- Bug in ``groupby`` signatures that didn't include \*args or \*\*kwargs (:issue:`8733`).
- ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo and when it receives no data from Yahoo (:issue:`8761`), (:issue:`8783`).
- Unclear error message in csv parsing when passing dtype and names and the parsed data is a different data type (:issue:`8833`)
- Bug in slicing a multi-index with an empty list and at least one boolean indexer (:issue:`8781`)
- ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo (:issue:`8761`).
- ``Timedelta`` kwargs may now be numpy ints and floats (:issue:`8757`).
Expand Down
42 changes: 37 additions & 5 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,6 @@ def test_index_col_named(self):

h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
data = h + no_header
# import pdb; pdb.set_trace()
rs = self.read_csv(StringIO(data), index_col='ID')
xp = self.read_csv(StringIO(data), header=0).set_index('ID')
tm.assert_frame_equal(rs, xp)
Expand Down Expand Up @@ -2864,8 +2863,8 @@ def test_empty_lines(self):
def test_whitespace_lines(self):
data = """

\t \t\t
\t
\t \t\t
\t
A,B,C
\t 1,2.,4.
5.,NaN,10.0
Expand Down Expand Up @@ -3110,8 +3109,8 @@ def test_empty_lines(self):
def test_whitespace_lines(self):
data = """

\t \t\t
\t
\t \t\t
\t
A,B,C
\t 1,2.,4.
5.,NaN,10.0
Expand Down Expand Up @@ -3154,6 +3153,39 @@ def test_passing_dtype(self):
self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' },
index_col=0)

def test_dtype_and_names_error(self):

# GH 8833
# passing both dtype and names resulted in an unclear error message

data = """
1.0 1
2.0 2
3.0 3
"""
# base cases
result = self.read_csv(StringIO(data),sep='\s+',header=None)
expected = DataFrame([[1.0,1],[2.0,2],[3.0,3]])
tm.assert_frame_equal(result, expected)

result = self.read_csv(StringIO(data),sep='\s+',header=None,names=['a','b'])
expected = DataFrame([[1.0,1],[2.0,2],[3.0,3]],columns=['a','b'])
tm.assert_frame_equal(result, expected)

# fallback casting
result = self.read_csv(StringIO(data),sep='\s+',header=None,names=['a','b'],dtype={'a' : int})
expected = DataFrame([[1,1],[2,2],[3,3]],columns=['a','b'])
tm.assert_frame_equal(result, expected)

data = """
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @mrocklin
@cpcloud

this case actually raises internally (e.g. coercing a float -> int), but then I ignore the error and don't cast.

So should this raise? (basically the user is requesting an int, but the data cannot be cast to it).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah i think if you request an int when there's clearly a nan in the column then it should raise

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the other alternative is to ignore the user, but that doesn't seem very polite

1.0 1
nan 2
3.0 3
"""
# fallback casting, but not castable
with tm.assertRaisesRegexp(ValueError, 'cannot safely convert'):
self.read_csv(StringIO(data),sep='\s+',header=None,names=['a','b'],dtype={'a' : int})

def test_fallback_to_python(self):
# GH 6607
data = 'a b c\n1 2 3'
Expand Down
32 changes: 28 additions & 4 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1002,8 +1002,12 @@ cdef class TextReader:
else:
col_dtype = np.dtype(col_dtype).str

return self._convert_with_dtype(col_dtype, i, start, end,
na_filter, 1, na_hashset, na_flist)
col_res, na_count = self._convert_with_dtype(col_dtype, i, start, end,
na_filter, 1, na_hashset, na_flist)

# fallback on the parse (e.g. we requested int dtype, but it's actually a float)
if col_res is not None:
return col_res, na_count

if i in self.noconvert:
return self._string_convert(i, start, end, na_filter, na_hashset)
Expand All @@ -1020,6 +1024,25 @@ cdef class TextReader:
if col_res is not None:
break

# we had a fallback parse on the dtype, so now try to cast
# only allow safe casts, e.g. with a nan you cannot safely cast to int
if col_res is not None and col_dtype is not None:
try:
col_res = col_res.astype(col_dtype,casting='safe')
except TypeError:

# float -> int conversions can fail the above
# even with no nans
col_res_orig = col_res
col_res = col_res.astype(col_dtype)
if (col_res != col_res_orig).any():
raise ValueError("cannot safely convert passed user dtype of "
"{col_dtype} for {col_res} dtyped data in "
"column {column}".format(col_dtype=col_dtype,
col_res=col_res_orig.dtype.name,
column=i))


return col_res, na_count

cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
Expand All @@ -1033,8 +1056,9 @@ cdef class TextReader:
if dtype[1] == 'i' or dtype[1] == 'u':
result, na_count = _try_int64(self.parser, i, start, end,
na_filter, na_hashset)
if user_dtype and na_count > 0:
raise Exception('Integer column has NA values')
if user_dtype and na_count is not None:
if na_count > 0:
raise Exception('Integer column has NA values')

if dtype[1:] != 'i8':
result = result.astype(dtype)
Expand Down