Merge pull request #4784 from jtratner/fix-read-fwf-with-compression

jtratner · jtratner · commit 53eec0898119 · 2013-09-09T05:27:11.000-07:00
BUG: Fix read_fwf with compressed files.
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -369,6 +369,8 @@ Bug Fixes
   - Bug in ``iloc`` with a slice index failing (:issue:`4771`)
   - Incorrect error message with no colspecs or width in ``read_fwf``. (:issue:`4774`)
   - Fix bugs in indexing in a Series with a duplicate index (:issue:`4548`, :issue:`4550`)
+  - Fixed bug in ``read_fwf`` where compressed files could not be read in
+    Python 3 because lines were returned as undecoded bytes (:issue:`3963`)
 
 pandas 0.12.0
 -------------
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -14,6 +14,7 @@
 from pandas.core.frame import DataFrame
 import datetime
 import pandas.core.common as com
+from pandas.core.config import get_option
 from pandas import compat
 from pandas.io.date_converters import generic_parser
 from pandas.io.common import get_filepath_or_buffer
@@ -1921,11 +1922,14 @@ class FixedWidthReader(object):
     """
     A reader of fixed-width lines.
     """
-    def __init__(self, f, colspecs, filler, thousands=None):
+    def __init__(self, f, colspecs, filler, thousands=None, encoding=None):
         self.f = f
         self.colspecs = colspecs
         self.filler = filler  # Empty characters between fields.
         self.thousands = thousands
+        if encoding is None:
+            encoding = get_option('display.encoding')
+        self.encoding = encoding
 
         if not ( isinstance(colspecs, (tuple, list))):
             raise AssertionError()
@@ -1937,11 +1941,20 @@ def __init__(self, f, colspecs, filler, thousands=None):
                        isinstance(colspec[1], int) ):
                 raise AssertionError()
 
-    def next(self):
-        line = next(self.f)
-        # Note: 'colspecs' is a sequence of half-open intervals.
-        return [line[fromm:to].strip(self.filler or ' ')
-                for (fromm, to) in self.colspecs]
+    if compat.PY3:
+        def next(self):
+            line = next(self.f)
+            if isinstance(line, bytes):
+                line = line.decode(self.encoding)
+            # Note: 'colspecs' is a sequence of half-open intervals.
+            return [line[fromm:to].strip(self.filler or ' ')
+                    for (fromm, to) in self.colspecs]
+    else:
+        def next(self):
+            line = next(self.f)
+            # Note: 'colspecs' is a sequence of half-open intervals.
+            return [line[fromm:to].strip(self.filler or ' ')
+                    for (fromm, to) in self.colspecs]
 
     # Iterator protocol in Python 3 uses __next__()
     __next__ = next
@@ -1959,7 +1972,8 @@ def __init__(self, f, **kwds):
         PythonParser.__init__(self, f, **kwds)
 
     def _make_reader(self, f):
-        self.data = FixedWidthReader(f, self.colspecs, self.delimiter)
+        self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
+                                     encoding=self.encoding)
 
 
 ##### deprecations in 0.12 #####
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2028,6 +2028,31 @@ def test_fwf_regression(self):
             res = df.loc[:,c]
             self.assert_(len(res))
 
+    def test_fwf_compression(self):
+        try:
+            import gzip
+            import bz2
+        except ImportError:
+            raise nose.SkipTest("Need gzip and bz2 to run this test")
+
+        data = """1111111111
+        2222222222
+        3333333333""".strip()
+        widths = [5, 5]
+        names = ['one', 'two']
+        expected = read_fwf(StringIO(data), widths=widths, names=names)
+        if compat.PY3:
+            data = bytes(data, encoding='utf-8')
+        for comp_name, compresser in [('gzip', gzip.GzipFile),
+                                      ('bz2', bz2.BZ2File)]:
+            with tm.ensure_clean() as path:
+                tmp = compresser(path, mode='wb')
+                tmp.write(data)
+                tmp.close()
+                result = read_fwf(path, widths=widths, names=names,
+                                  compression=comp_name)
+                tm.assert_frame_equal(result, expected)
+
     def test_verbose_import(self):
         text = """a,b,c,d
 one,1,2,3