diff --git a/doc/source/release.rst b/doc/source/release.rst index 140c3bc836fdb..124661021f45c 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -393,6 +393,9 @@ Bug Fixes - Fixed bug with reading compressed files with ``read_fwf`` in Python 3. (:issue:`3963`) - Fixed an issue with a duplicate index and assignment with a dtype change (:issue:`4686`) + - Fixed bug with reading compressed files in as ``bytes`` rather than ``str`` + in Python 3. Simplifies bytes-producing file-handling in Python 3 + (:issue:`3963`, :issue:`4785`). pandas 0.12.0 ------------- diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 1b5939eb98417..12c929cd59820 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -36,6 +36,7 @@ import types PY3 = (sys.version_info[0] >= 3) +PY3_2 = sys.version_info[:2] == (3, 2) try: import __builtin__ as builtins diff --git a/pandas/core/common.py b/pandas/core/common.py index b58bd92a4fd1f..34aaa08b57171 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -5,6 +5,7 @@ import re import codecs import csv +import sys from numpy.lib.format import read_array, write_array import numpy as np @@ -1858,27 +1859,42 @@ def next(self): def _get_handle(path, mode, encoding=None, compression=None): + """Gets file handle for given path and mode. + NOTE: Under Python 3.2, getting a compressed file handle means reading in the entire file, + decompressing it and decoding it to ``str`` all at once and then wrapping it in a StringIO. + """ if compression is not None: - if encoding is not None: - raise ValueError('encoding + compression not yet supported') + if encoding is not None and not compat.PY3: + msg = 'encoding + compression not yet supported in Python 2' + raise ValueError(msg) if compression == 'gzip': import gzip - return gzip.GzipFile(path, 'rb') + f = gzip.GzipFile(path, 'rb') elif compression == 'bz2': import bz2 - return bz2.BZ2File(path, 'rb') + + f = bz2.BZ2File(path, 'rb') else: raise ValueError('Unrecognized compression type: %s' % compression) - - if compat.PY3: # pragma: no cover - if encoding: - f = open(path, mode, encoding=encoding) - else: - f = open(path, mode, errors='replace') + if compat.PY3_2: + # gzip and bz2 don't work with TextIOWrapper in 3.2 + encoding = encoding or get_option('display.encoding') + f = StringIO(f.read().decode(encoding)) + elif compat.PY3: + from io import TextIOWrapper + f = TextIOWrapper(f, encoding=encoding) + return f else: - f = open(path, mode) + if compat.PY3: + if encoding: + f = open(path, mode, encoding=encoding) + else: + f = open(path, mode, errors='replace') + else: + f = open(path, mode) + return f if compat.PY3: # pragma: no cover diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 06940e3bb2b4c..5554bef4acf98 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1175,13 +1175,36 @@ def count_empty_vals(vals): return sum([1 for v in vals if v == '' or v is None]) -def _wrap_compressed(f, compression): +def _wrap_compressed(f, compression, encoding=None): + """wraps compressed fileobject in a decompressing fileobject + NOTE: For all files in Python 3.2 and for bzip'd files under all Python + versions, this means reading in the entire file and then re-wrapping it in + StringIO. + """ compression = compression.lower() + encoding = encoding or get_option('display.encoding') if compression == 'gzip': import gzip - return gzip.GzipFile(fileobj=f) + + f = gzip.GzipFile(fileobj=f) + if compat.PY3_2: + # 3.2's gzip doesn't support read1 + f = StringIO(f.read().decode(encoding)) + elif compat.PY3: + from io import TextIOWrapper + + f = TextIOWrapper(f) + return f elif compression == 'bz2': - raise ValueError('Python cannot read bz2 data from file handle') + import bz2 + + # bz2 module can't take file objects, so have to run through decompress + # manually + data = bz2.decompress(f.read()) + if compat.PY3: + data = data.decode(encoding) + f = StringIO(data) + return f else: raise ValueError('do not recognize compression method %s' % compression) @@ -1235,7 +1258,12 @@ def __init__(self, f, **kwds): f = com._get_handle(f, 'r', encoding=self.encoding, compression=self.compression) elif self.compression: - f = _wrap_compressed(f, self.compression) + f = _wrap_compressed(f, self.compression, self.encoding) + # in Python 3, convert BytesIO or fileobjects passed with an encoding + elif compat.PY3 and isinstance(f, compat.BytesIO): + from io import TextIOWrapper + + f = TextIOWrapper(f, encoding=self.encoding) if hasattr(f, 'readline'): self._make_reader(f) @@ -1321,14 +1349,9 @@ class MyDialect(csv.Dialect): def _read(): line = next(f) pat = re.compile(sep) - if (compat.PY3 and isinstance(line, bytes)): - yield pat.split(line.decode('utf-8').strip()) - for line in f: - yield pat.split(line.decode('utf-8').strip()) - else: + yield pat.split(line.strip()) + for line in f: yield pat.split(line.strip()) - for line in f: - yield pat.split(line.strip()) reader = _read() self.data = reader diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index f872ddd793935..fb2b3fdd33bf1 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # pylint: disable=E1101 from datetime import datetime @@ -2043,8 +2044,8 @@ def test_fwf_compression(self): expected = read_fwf(StringIO(data), widths=widths, names=names) if compat.PY3: data = bytes(data, encoding='utf-8') - for comp_name, compresser in [('gzip', gzip.GzipFile), - ('bz2', bz2.BZ2File)]: + comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)] + for comp_name, compresser in comps: with tm.ensure_clean() as path: tmp = compresser(path, mode='wb') tmp.write(data) @@ -2053,6 +2054,18 @@ def test_fwf_compression(self): compression=comp_name) tm.assert_frame_equal(result, expected) + def test_BytesIO_input(self): + if not compat.PY3: + raise nose.SkipTest("Bytes-related test - only needs to work on Python 3") + result = pd.read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[2,2]) + expected = pd.DataFrame([["של", "ום"]], columns=["של", "ום"]) + tm.assert_frame_equal(result, expected) + data = BytesIO("שלום::1234\n562::123".encode('cp1255')) + result = pd.read_table(data, sep="::", engine='python', + encoding='cp1255') + expected = pd.DataFrame([[562, 123]], columns=["שלום","1234"]) + tm.assert_frame_equal(result, expected) + def test_verbose_import(self): text = """a,b,c,d one,1,2,3