Skip to content

BUG: Fix input bytes conversion in Py3 to return str #4783

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,9 @@ Bug Fixes
- Fixed bug with reading compressed files with ``read_fwf`` in Python 3.
(:issue:`3963`)
- Fixed an issue with a duplicate index and assignment with a dtype change (:issue:`4686`)
- Fixed bug where compressed files were read in as ``bytes`` rather than
``str`` in Python 3, and simplified the handling of bytes-producing file
objects in Python 3 (:issue:`3963`, :issue:`4785`).

pandas 0.12.0
-------------
Expand Down
1 change: 1 addition & 0 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import types

PY3 = (sys.version_info[0] >= 3)
PY3_2 = sys.version_info[:2] == (3, 2)

try:
import __builtin__ as builtins
Expand Down
38 changes: 27 additions & 11 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import re
import codecs
import csv
import sys

from numpy.lib.format import read_array, write_array
import numpy as np
Expand Down Expand Up @@ -1858,27 +1859,42 @@ def next(self):


def _get_handle(path, mode, encoding=None, compression=None):
    """Gets file handle for given path and mode.

    Parameters
    ----------
    path : str
        Location of the file to open.
    mode : str
        Mode passed to ``open`` for uncompressed files. Compressed files
        are always opened with ``'rb'``.
    encoding : str, optional
        Text encoding. With compression this is only supported on Python 3.
    compression : {'gzip', 'bz2'}, optional
        Compression format of the file; ``None`` means uncompressed.

    Returns
    -------
    A file-like object. On Python 3 compressed input is wrapped so that it
    is decoded to ``str``.

    Raises
    ------
    ValueError
        If ``compression`` is not recognized, or if ``encoding`` is combined
        with ``compression`` on Python 2.

    NOTE: Under Python 3.2, getting a compressed file handle means reading in
    the entire file, decompressing it and decoding it to ``str`` all at once
    and then wrapping it in a StringIO.
    """
    if compression is not None:
        if encoding is not None and not compat.PY3:
            msg = 'encoding + compression not yet supported in Python 2'
            raise ValueError(msg)

        if compression == 'gzip':
            import gzip
            f = gzip.GzipFile(path, 'rb')
        elif compression == 'bz2':
            import bz2
            f = bz2.BZ2File(path, 'rb')
        else:
            raise ValueError('Unrecognized compression type: %s' %
                             compression)

        if compat.PY3_2:
            # gzip and bz2 don't work with TextIOWrapper in 3.2
            encoding = encoding or get_option('display.encoding')
            raw = f
            try:
                f = StringIO(raw.read().decode(encoding))
            finally:
                # The whole payload now lives in the StringIO, so release
                # the underlying compressed file instead of leaking it.
                raw.close()
        elif compat.PY3:
            from io import TextIOWrapper
            f = TextIOWrapper(f, encoding=encoding)
        return f
    else:
        if compat.PY3:
            if encoding:
                f = open(path, mode, encoding=encoding)
            else:
                f = open(path, mode, errors='replace')
        else:
            f = open(path, mode)

    return f

if compat.PY3: # pragma: no cover
Expand Down
45 changes: 34 additions & 11 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1175,13 +1175,36 @@ def count_empty_vals(vals):
return sum([1 for v in vals if v == '' or v is None])


def _wrap_compressed(f, compression, encoding=None):
    """wraps compressed fileobject in a decompressing fileobject

    Parameters
    ----------
    f : file-like object
        Open handle producing the compressed bytes.
    compression : str
        ``'gzip'`` or ``'bz2'`` (matched case-insensitively).
    encoding : str, optional
        Encoding used to decode the decompressed bytes on Python 3.
        Falls back to the ``display.encoding`` option when not given.

    Returns
    -------
    A file-like object over the decompressed data.

    Raises
    ------
    ValueError
        If ``compression`` is not a recognized method.

    NOTE: For all files in Python 3.2 and for bzip'd files under all Python
    versions, this means reading in the entire file and then re-wrapping it in
    StringIO.
    """
    compression = compression.lower()
    encoding = encoding or get_option('display.encoding')
    if compression == 'gzip':
        import gzip

        f = gzip.GzipFile(fileobj=f)
        if compat.PY3_2:
            # 3.2's gzip doesn't support read1
            f = StringIO(f.read().decode(encoding))
        elif compat.PY3:
            from io import TextIOWrapper

            # Pass the resolved encoding explicitly; without it the wrapper
            # silently falls back to the locale default and ignores the
            # caller's choice, unlike the 3.2 and bz2 branches.
            f = TextIOWrapper(f, encoding=encoding)
        return f
    elif compression == 'bz2':
        import bz2

        # bz2 module can't take file objects, so have to run through decompress
        # manually
        data = bz2.decompress(f.read())
        if compat.PY3:
            data = data.decode(encoding)
        f = StringIO(data)
        return f
    else:
        raise ValueError('do not recognize compression method %s'
                         % compression)
Expand Down Expand Up @@ -1235,7 +1258,12 @@ def __init__(self, f, **kwds):
f = com._get_handle(f, 'r', encoding=self.encoding,
compression=self.compression)
elif self.compression:
f = _wrap_compressed(f, self.compression)
f = _wrap_compressed(f, self.compression, self.encoding)
# in Python 3, convert BytesIO or fileobjects passed with an encoding
elif compat.PY3 and isinstance(f, compat.BytesIO):
from io import TextIOWrapper

f = TextIOWrapper(f, encoding=self.encoding)

if hasattr(f, 'readline'):
self._make_reader(f)
Expand Down Expand Up @@ -1321,14 +1349,9 @@ class MyDialect(csv.Dialect):
def _read():
    # Lazily split every line of ``f`` on the regex separator ``sep``
    # (both taken from the enclosing scope), yielding one list of
    # fields per line.
    #
    # A plain ``for`` loop replaces the former ``next(f)`` + loop pair:
    # on empty input the generator now simply finishes, instead of
    # letting StopIteration escape from the generator body (which is a
    # RuntimeError under PEP 479).
    pat = re.compile(sep)
    for line in f:
        yield pat.split(line.strip())
reader = _read()

self.data = reader
Expand Down
17 changes: 15 additions & 2 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
# pylint: disable=E1101

from datetime import datetime
Expand Down Expand Up @@ -2043,8 +2044,8 @@ def test_fwf_compression(self):
expected = read_fwf(StringIO(data), widths=widths, names=names)
if compat.PY3:
data = bytes(data, encoding='utf-8')
for comp_name, compresser in [('gzip', gzip.GzipFile),
('bz2', bz2.BZ2File)]:
comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)]
for comp_name, compresser in comps:
with tm.ensure_clean() as path:
tmp = compresser(path, mode='wb')
tmp.write(data)
Expand All @@ -2053,6 +2054,18 @@ def test_fwf_compression(self):
compression=comp_name)
tm.assert_frame_equal(result, expected)

def test_BytesIO_input(self):
    """Bytes buffers given to the readers come back as decoded text (Python 3 only)."""
    if not compat.PY3:
        raise nose.SkipTest("Bytes-related test - only needs to work on Python 3")

    # Fixed-width input supplied as UTF-8 encoded bytes.
    fwf_buffer = BytesIO("שלום\nשלום".encode('utf8'))
    fwf_frame = pd.read_fwf(fwf_buffer, widths=[2,2])
    fwf_wanted = pd.DataFrame([["של", "ום"]], columns=["של", "ום"])
    tm.assert_frame_equal(fwf_frame, fwf_wanted)

    # Delimited input in a non-UTF-8 encoding, read with the python engine.
    table_buffer = BytesIO("שלום::1234\n562::123".encode('cp1255'))
    table_frame = pd.read_table(table_buffer, sep="::", engine='python',
                                encoding='cp1255')
    table_wanted = pd.DataFrame([[562, 123]], columns=["שלום","1234"])
    tm.assert_frame_equal(table_frame, table_wanted)

def test_verbose_import(self):
text = """a,b,c,d
one,1,2,3
Expand Down