-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
BUG: Fix inconsistent C engine quoting behaviour #13411
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -202,10 +202,9 @@ | |
quotechar : str (length 1), optional | ||
The character used to denote the start and end of a quoted item. Quoted | ||
items can include the delimiter and it will be ignored. | ||
quoting : int or csv.QUOTE_* instance, default None | ||
quoting : int or csv.QUOTE_* instance, default 0 | ||
Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe advertise that these are in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, might as well. Will add. |
||
QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). | ||
Default (None) results in QUOTE_MINIMAL behavior. | ||
doublequote : boolean, default ``True`` | ||
When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate | ||
whether or not to interpret two consecutive quotechar elements INSIDE a | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
Tests that quoting specifications are properly handled | ||
during parsing for all of the parsers defined in parsers.py | ||
""" | ||
|
||
import csv | ||
import pandas.util.testing as tm | ||
|
||
from pandas import DataFrame | ||
from pandas.compat import StringIO | ||
|
||
|
||
class QuotingTests(object): | ||
|
||
def test_bad_quote_char(self): | ||
data = '1,2,3' | ||
|
||
# Python 2.x: "...must be an 1-character..." | ||
# Python 3.x: "...must be a 1-character..." | ||
msg = '"quotechar" must be a(n)? 1-character string' | ||
tm.assertRaisesRegexp(TypeError, msg, self.read_csv, | ||
StringIO(data), quotechar='foo') | ||
|
||
msg = 'quotechar must be set if quoting enabled' | ||
tm.assertRaisesRegexp(TypeError, msg, self.read_csv, | ||
StringIO(data), quotechar=None, | ||
quoting=csv.QUOTE_MINIMAL) | ||
|
||
msg = '"quotechar" must be string, not int' | ||
tm.assertRaisesRegexp(TypeError, msg, self.read_csv, | ||
StringIO(data), quotechar=2) | ||
|
||
def test_bad_quoting(self): | ||
data = '1,2,3' | ||
|
||
msg = '"quoting" must be an integer' | ||
tm.assertRaisesRegexp(TypeError, msg, self.read_csv, | ||
StringIO(data), quoting='foo') | ||
|
||
# quoting must in the range [0, 3] | ||
msg = 'bad "quoting" value' | ||
tm.assertRaisesRegexp(TypeError, msg, self.read_csv, | ||
StringIO(data), quoting=5) | ||
|
||
def test_quote_char_basic(self): | ||
data = 'a,b,c\n1,2,"cat"' | ||
expected = DataFrame([[1, 2, 'cat']], | ||
columns=['a', 'b', 'c']) | ||
result = self.read_csv(StringIO(data), quotechar='"') | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_quote_char_various(self): | ||
data = 'a,b,c\n1,2,"cat"' | ||
expected = DataFrame([[1, 2, 'cat']], | ||
columns=['a', 'b', 'c']) | ||
quote_chars = ['~', '*', '%', '$', '@', 'P'] | ||
|
||
for quote_char in quote_chars: | ||
new_data = data.replace('"', quote_char) | ||
result = self.read_csv(StringIO(new_data), quotechar=quote_char) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_null_quote_char(self): | ||
data = 'a,b,c\n1,2,3' | ||
|
||
# sanity checks | ||
msg = 'quotechar must be set if quoting enabled' | ||
|
||
tm.assertRaisesRegexp(TypeError, msg, self.read_csv, | ||
StringIO(data), quotechar=None, | ||
quoting=csv.QUOTE_MINIMAL) | ||
|
||
tm.assertRaisesRegexp(TypeError, msg, self.read_csv, | ||
StringIO(data), quotechar='', | ||
quoting=csv.QUOTE_MINIMAL) | ||
|
||
# no errors should be raised if quoting is None | ||
expected = DataFrame([[1, 2, 3]], | ||
columns=['a', 'b', 'c']) | ||
|
||
result = self.read_csv(StringIO(data), quotechar=None, | ||
quoting=csv.QUOTE_NONE) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
result = self.read_csv(StringIO(data), quotechar='', | ||
quoting=csv.QUOTE_NONE) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_quoting_various(self): | ||
data = '1,2,"foo"' | ||
cols = ['a', 'b', 'c'] | ||
|
||
# QUOTE_MINIMAL and QUOTE_ALL apply only to | ||
# the CSV writer, so they should have no | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should these be an error then? (or you are saying they don't do anything so might as well accept them)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Definitely not an error. Just leave it alone is best. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But maybe we shouldn't mention them explicitly as options in documentation then? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well they are technically valid to Python's |
||
# special effect for the CSV reader | ||
expected = DataFrame([[1, 2, 'foo']], columns=cols) | ||
|
||
# test default (afterwards, arguments are all explicit) | ||
result = self.read_csv(StringIO(data), names=cols) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
result = self.read_csv(StringIO(data), quotechar='"', | ||
quoting=csv.QUOTE_MINIMAL, names=cols) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
result = self.read_csv(StringIO(data), quotechar='"', | ||
quoting=csv.QUOTE_ALL, names=cols) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# QUOTE_NONE tells the reader to do no special handling | ||
# of quote characters and leave them alone | ||
expected = DataFrame([[1, 2, '"foo"']], columns=cols) | ||
result = self.read_csv(StringIO(data), quotechar='"', | ||
quoting=csv.QUOTE_NONE, names=cols) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# QUOTE_NONNUMERIC tells the reader to cast | ||
# all non-quoted fields to float | ||
expected = DataFrame([[1.0, 2.0, 'foo']], columns=cols) | ||
result = self.read_csv(StringIO(data), quotechar='"', | ||
quoting=csv.QUOTE_NONNUMERIC, | ||
names=cols) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_double_quote(self): | ||
data = 'a,b\n3,"4 "" 5"' | ||
|
||
expected = DataFrame([[3, '4 " 5']], | ||
columns=['a', 'b']) | ||
result = self.read_csv(StringIO(data), quotechar='"', | ||
doublequote=True) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
expected = DataFrame([[3, '4 " 5"']], | ||
columns=['a', 'b']) | ||
result = self.read_csv(StringIO(data), quotechar='"', | ||
doublequote=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't really understand this one (I mean: reading the docs, I would expect There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not entirely sure from the Python engine perspective but from the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But why the quote after the 5? (so why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No it is not. The quote is considered in field. Follow along with the code and you'll see. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can see indeed the logic in the code. So after a closing quote, the rest of the field is regarded as a normal field, and cannot be a quoted field anymore, regardless of occurring quotes. Anyway, this is a rather pathological case, so not that important. I just found it a bit strange (and as you are adding the behaviour explicitly as a test, wanted to check it) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, okay. Thanks for clarifying! |
||
tm.assert_frame_equal(result, expected) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ from libc.string cimport strncpy, strlen, strcmp, strcasecmp | |
cimport libc.stdio as stdio | ||
import warnings | ||
|
||
from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE | ||
from cpython cimport (PyObject, PyBytes_FromString, | ||
PyBytes_AsString, PyBytes_Check, | ||
PyUnicode_Check, PyUnicode_AsUTF8String) | ||
|
@@ -283,6 +284,7 @@ cdef class TextReader: | |
object compression | ||
object mangle_dupe_cols | ||
object tupleize_cols | ||
list dtype_cast_order | ||
set noconvert, usecols | ||
|
||
def __cinit__(self, source, | ||
|
@@ -393,8 +395,13 @@ cdef class TextReader: | |
raise ValueError('Only length-1 escapes supported') | ||
self.parser.escapechar = ord(escapechar) | ||
|
||
self.parser.quotechar = ord(quotechar) | ||
self.parser.quoting = quoting | ||
self._set_quoting(quotechar, quoting) | ||
|
||
# TODO: endianness just a placeholder? | ||
if quoting == QUOTE_NONNUMERIC: | ||
self.dtype_cast_order = ['<f8', '<i8', '|b1', '|O8'] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why don't you use somthing like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I c you got this from below. nvm then. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Just saw your second comment - weird GitHub mistiming in which I missed your comment but it somehow was made before I made mine 😕 |
||
else: | ||
self.dtype_cast_order = ['<i8', '<f8', '|b1', '|O8'] | ||
|
||
if comment is not None: | ||
if len(comment) > 1: | ||
|
@@ -548,6 +555,29 @@ cdef class TextReader: | |
def set_error_bad_lines(self, int status): | ||
self.parser.error_bad_lines = status | ||
|
||
def _set_quoting(self, quote_char, quoting): | ||
if not isinstance(quoting, int): | ||
raise TypeError('"quoting" must be an integer') | ||
|
||
if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE: | ||
raise TypeError('bad "quoting" value') | ||
|
||
if not isinstance(quote_char, (str, bytes)) and quote_char is not None: | ||
dtype = type(quote_char).__name__ | ||
raise TypeError('"quotechar" must be string, ' | ||
'not {dtype}'.format(dtype=dtype)) | ||
|
||
if quote_char is None or quote_char == '': | ||
if quoting != QUOTE_NONE: | ||
raise TypeError("quotechar must be set if quoting enabled") | ||
self.parser.quoting = quoting | ||
self.parser.quotechar = -1 | ||
elif len(quote_char) > 1: # 0-len case handled earlier | ||
raise TypeError('"quotechar" must be a 1-character string') | ||
else: | ||
self.parser.quoting = quoting | ||
self.parser.quotechar = ord(quote_char) | ||
|
||
cdef _make_skiprow_set(self): | ||
if isinstance(self.skiprows, (int, np.integer)): | ||
parser_set_skipfirstnrows(self.parser, self.skiprows) | ||
|
@@ -1066,7 +1096,7 @@ cdef class TextReader: | |
return self._string_convert(i, start, end, na_filter, na_hashset) | ||
else: | ||
col_res = None | ||
for dt in dtype_cast_order: | ||
for dt in self.dtype_cast_order: | ||
try: | ||
col_res, na_count = self._convert_with_dtype( | ||
dt, i, start, end, na_filter, 0, na_hashset, na_flist) | ||
|
@@ -1847,12 +1877,6 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL: | |
return table | ||
|
||
|
||
# if at first you don't succeed... | ||
|
||
# TODO: endianness just a placeholder? | ||
cdef list dtype_cast_order = ['<i8', '<f8', '|b1', '|O8'] | ||
|
||
|
||
cdef raise_parser_error(object base, parser_t *parser): | ||
message = '%s. C error: ' % base | ||
if parser.error_msg != NULL: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
isn't this still defaulted to
None
?, meaning don't consider quoting at all? (and NOTQUOTE_NONE
?) or are these the sameThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
quoting=0
None
raises an error (it always has been)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh, you are essentially fixing the documentation, ok then