From 8633d23f14fd2df4beec40e338c1bfea3a75eb2b Mon Sep 17 00:00:00 2001 From: Jeffrey Tratner Date: Sun, 8 Sep 2013 23:11:08 -0400 Subject: [PATCH 1/2] BUG: Fix read_fwf with compressed files. `gzip` and `bz2` both now return `bytes` rather than `str` in Python 3, so need to check for bytes and decode as necessary. --- doc/source/release.rst | 2 ++ pandas/io/parsers.py | 19 ++++++++++++++----- pandas/io/tests/test_parsers.py | 25 +++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index f32ea44ed6242..53c50100072f9 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -369,6 +369,8 @@ Bug Fixes - Bug in ``iloc`` with a slice index failing (:issue:`4771`) - Incorrect error message with no colspecs or width in ``read_fwf``. (:issue:`4774`) - Fix bugs in indexing in a Series with a duplicate index (:issue:`4548`, :issue:`4550`) + - Fixed bug with reading compressed files with ``read_fwf`` in Python 3. + (:issue:`3963`) pandas 0.12.0 ------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f05b0a676cde4..5ca0a498d1e07 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1937,11 +1937,20 @@ def __init__(self, f, colspecs, filler, thousands=None): isinstance(colspec[1], int) ): raise AssertionError() - def next(self): - line = next(self.f) - # Note: 'colspecs' is a sequence of half-open intervals. - return [line[fromm:to].strip(self.filler or ' ') - for (fromm, to) in self.colspecs] + if compat.PY3: + def next(self): + line = next(self.f) + if isinstance(line, bytes): + line = line.decode('utf-8') + # Note: 'colspecs' is a sequence of half-open intervals. + return [line[fromm:to].strip(self.filler or ' ') + for (fromm, to) in self.colspecs] + else: + def next(self): + line = next(self.f) + # Note: 'colspecs' is a sequence of half-open intervals. + return [line[fromm:to].strip(self.filler or ' ') + for (fromm, to) in self.colspecs] # Iterator protocol in Python 3 uses __next__() __next__ = next diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 9d751de6645ce..f872ddd793935 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2028,6 +2028,31 @@ def test_fwf_regression(self): res = df.loc[:,c] self.assert_(len(res)) + def test_fwf_compression(self): + try: + import gzip + import bz2 + except ImportError: + raise nose.SkipTest("Need gzip and bz2 to run this test") + + data = """1111111111 + 2222222222 + 3333333333""".strip() + widths = [5, 5] + names = ['one', 'two'] + expected = read_fwf(StringIO(data), widths=widths, names=names) + if compat.PY3: + data = bytes(data, encoding='utf-8') + for comp_name, compresser in [('gzip', gzip.GzipFile), + ('bz2', bz2.BZ2File)]: + with tm.ensure_clean() as path: + tmp = compresser(path, mode='wb') + tmp.write(data) + tmp.close() + result = read_fwf(path, widths=widths, names=names, + compression=comp_name) + tm.assert_frame_equal(result, expected) + def test_verbose_import(self): text = """a,b,c,d one,1,2,3 From c3dae2676de45b3391c1b71fbb7af58ff9a16108 Mon Sep 17 00:00:00 2001 From: Jeffrey Tratner Date: Mon, 9 Sep 2013 01:37:53 -0400 Subject: [PATCH 2/2] Use passed encoding to decode bytes --- pandas/io/parsers.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5ca0a498d1e07..e1b09eb76415f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -14,6 +14,7 @@ from pandas.core.frame import DataFrame import datetime import pandas.core.common as com +from pandas.core.config import get_option from pandas import compat from pandas.io.date_converters import generic_parser from pandas.io.common import get_filepath_or_buffer @@ -1921,11 +1922,14 @@ class FixedWidthReader(object): """ A reader of fixed-width lines. """ - def __init__(self, f, colspecs, filler, thousands=None): + def __init__(self, f, colspecs, filler, thousands=None, encoding=None): self.f = f self.colspecs = colspecs self.filler = filler # Empty characters between fields. self.thousands = thousands + if encoding is None: + encoding = get_option('display.encoding') + self.encoding = encoding if not ( isinstance(colspecs, (tuple, list))): raise AssertionError() @@ -1941,7 +1945,7 @@ def __init__(self, f, colspecs, filler, thousands=None): def next(self): line = next(self.f) if isinstance(line, bytes): - line = line.decode('utf-8') + line = line.decode(self.encoding) # Note: 'colspecs' is a sequence of half-open intervals. return [line[fromm:to].strip(self.filler or ' ') for (fromm, to) in self.colspecs] @@ -1968,7 +1972,8 @@ def __init__(self, f, **kwds): PythonParser.__init__(self, f, **kwds) def _make_reader(self, f): - self.data = FixedWidthReader(f, self.colspecs, self.delimiter) + self.data = FixedWidthReader(f, self.colspecs, self.delimiter, + encoding=self.encoding) ##### deprecations in 0.12 #####