diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 11d2fab464d1f..59a1a5f063f3a 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -788,3 +788,4 @@ Bug Fixes - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`) +- Bug in ``pd.read_csv`` in Python 2.x when parsing non-UTF8 encoded data with a multi-character separator (:issue:`3404`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bedf21318aa83..090d826a5c085 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1871,6 +1871,10 @@ class MyDialect(csv.Dialect): else: def _read(): line = f.readline() + + if compat.PY2 and self.encoding: + line = line.decode(self.encoding) + pat = re.compile(sep) yield pat.split(line.strip()) for line in f: diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index 0408401672a2f..ad81dbb9f6e0f 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -201,3 +201,19 @@ def test_skipfooter_with_decimal(self): result = self.read_csv(StringIO(data), names=['a'], decimal='#', skipfooter=1) tm.assert_frame_equal(result, expected) + + def test_encoding_non_utf8_multichar_sep(self): + # see gh-3404 + expected = DataFrame({'a': [1], 'b': [2]}) + + for sep in ['::', '#####', '!!!', '123', '#1!c5', + '%!c!d', '@@#4:2', '_!pd#_']: + data = '1' + sep + '2' + + for encoding in ['utf-16', 'utf-16-be', 'utf-16-le', + 'utf-32', 'cp037']: + encoded_data = data.encode(encoding) + result = self.read_csv(BytesIO(encoded_data), + sep=sep, names=['a', 'b'], + encoding=encoding) + tm.assert_frame_equal(result, expected)