Skip to content

Commit 53eec08

Browse files
committed
Merge pull request #4784 from jtratner/fix-read-fwf-with-compression
BUG: Fix read_fwf with compressed files.
2 parents e461793 + c3dae26 commit 53eec08

File tree

3 files changed

+48
-7
lines changed

3 files changed

+48
-7
lines changed

doc/source/release.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,8 @@ Bug Fixes
369369
- Bug in ``iloc`` with a slice index failing (:issue:`4771`)
370370
- Incorrect error message with no colspecs or width in ``read_fwf``. (:issue:`4774`)
371371
- Fix bugs in indexing in a Series with a duplicate index (:issue:`4548`, :issue:`4550`)
372+
- Fixed bug with reading compressed files with ``read_fwf`` in Python 3.
373+
(:issue:`3963`)
372374

373375
pandas 0.12.0
374376
-------------

pandas/io/parsers.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from pandas.core.frame import DataFrame
1515
import datetime
1616
import pandas.core.common as com
17+
from pandas.core.config import get_option
1718
from pandas import compat
1819
from pandas.io.date_converters import generic_parser
1920
from pandas.io.common import get_filepath_or_buffer
@@ -1921,11 +1922,14 @@ class FixedWidthReader(object):
19211922
"""
19221923
A reader of fixed-width lines.
19231924
"""
1924-
def __init__(self, f, colspecs, filler, thousands=None):
1925+
def __init__(self, f, colspecs, filler, thousands=None, encoding=None):
19251926
self.f = f
19261927
self.colspecs = colspecs
19271928
self.filler = filler # Empty characters between fields.
19281929
self.thousands = thousands
1930+
if encoding is None:
1931+
encoding = get_option('display.encoding')
1932+
self.encoding = encoding
19291933

19301934
if not ( isinstance(colspecs, (tuple, list))):
19311935
raise AssertionError()
@@ -1937,11 +1941,20 @@ def __init__(self, f, colspecs, filler, thousands=None):
19371941
isinstance(colspec[1], int) ):
19381942
raise AssertionError()
19391943

1940-
def next(self):
1941-
line = next(self.f)
1942-
# Note: 'colspecs' is a sequence of half-open intervals.
1943-
return [line[fromm:to].strip(self.filler or ' ')
1944-
for (fromm, to) in self.colspecs]
1944+
def next(self):
    """
    Read one line from the underlying source and cut it into fields.

    On Python 3, a line that arrives as ``bytes`` is decoded with
    ``self.encoding`` before it is sliced.  Each ``(start, stop)`` pair
    in ``self.colspecs`` is a half-open interval; every sliced field is
    stripped of the ``self.filler`` characters, or of spaces when no
    filler is set.
    """
    raw = next(self.f)
    # On Python 2 ``bytes`` is just ``str``, so decoding is limited to
    # Python 3, where a binary source yields ``bytes`` lines.
    if compat.PY3 and isinstance(raw, bytes):
        raw = raw.decode(self.encoding)
    pad = self.filler or ' '
    fields = []
    for start, stop in self.colspecs:
        fields.append(raw[start:stop].strip(pad))
    return fields
19451958

19461959
# Iterator protocol in Python 3 uses __next__()
19471960
__next__ = next
@@ -1959,7 +1972,8 @@ def __init__(self, f, **kwds):
19591972
PythonParser.__init__(self, f, **kwds)
19601973

19611974
def _make_reader(self, f):
1962-
self.data = FixedWidthReader(f, self.colspecs, self.delimiter)
1975+
self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
1976+
encoding=self.encoding)
19631977

19641978

19651979
##### deprecations in 0.12 #####

pandas/io/tests/test_parsers.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2028,6 +2028,31 @@ def test_fwf_regression(self):
20282028
res = df.loc[:,c]
20292029
self.assert_(len(res))
20302030

2031+
def test_fwf_compression(self):
    """
    Check that ``read_fwf`` reads gzip- and bz2-compressed files.

    The same fixed-width text is parsed once from an in-memory buffer
    (the expected frame) and once from each compressed temporary file;
    both results must be equal.
    """
    try:
        import gzip
        import bz2
    except ImportError:
        raise nose.SkipTest("Need gzip and bz2 to run this test")

    data = """1111111111
2222222222
3333333333""".strip()
    widths = [5, 5]
    names = ['one', 'two']
    expected = read_fwf(StringIO(data), widths=widths, names=names)
    if compat.PY3:
        # The compressed files below are opened in binary mode.
        data = bytes(data, encoding='utf-8')
    for comp_name, compressor in [('gzip', gzip.GzipFile),
                                  ('bz2', bz2.BZ2File)]:
        with tm.ensure_clean() as path:
            tmp = compressor(path, mode='wb')
            # Close the handle even if the write fails, so the temporary
            # file is never left open when ensure_clean() cleans up.
            # try/finally is used instead of ``with`` because
            # context-manager support on these file classes depends on
            # the interpreter version.
            try:
                tmp.write(data)
            finally:
                tmp.close()
            result = read_fwf(path, widths=widths, names=names,
                              compression=comp_name)
            tm.assert_frame_equal(result, expected)
2055+
20312056
def test_verbose_import(self):
20322057
text = """a,b,c,d
20332058
one,1,2,3

0 commit comments

Comments
 (0)